diff --git a/OWNERS b/OWNERS
index 4bea7be7b2b6746d39f340a7cb07dbe27a9e80e1..89f1e2bfa3d8e3297e9f41aa4b3ba9892d86807b 100644
--- a/OWNERS
+++ b/OWNERS
@@ -2,6 +2,8 @@ approvers:
 - leo920320
 - wo-wenjie
 - ma-dongfang
+- xhahn
+- aerfaliang
 reviewers:
 - leo920320
 - wo-wenjie
diff --git a/debug/tools/merge_profiling_timeline/README.md b/debug/tools/merge_profiling_timeline/README.md
index b9df3f24ef38b459a5acdb28ae160bc440b6e393..5a899872a708786cdd7142d210b998c71e715ffb 100644
--- a/debug/tools/merge_profiling_timeline/README.md
+++ b/debug/tools/merge_profiling_timeline/README.md
@@ -3,102 +3,11 @@
 ## Introduction
 This tool merges profiling timeline data. It supports merging the timelines of specified ranks and merging specified items within a timeline.
 
-## Software architecture
-Software architecture description
-
-## Usage
-> When merging the timelines of a single node with multiple devices, no cross-node time calibration is involved; skip section 1 entirely.
-
-### 1 Generate time_difference.json, the file of server/client startup time differences
+## 1 Merging multiple timelines (standard)
-
-In the steps below, the server is the machine this code runs on. All steps are performed on the server, and any node of the cluster may be chosen as the server.
-
-#### 1.1 Clone the code to any directory on any cluster node and enter the get_nodes_timediff directory
-
-```shell
-git clone https://gitee.com/aerfaliang/merge_profiling_timeline.git
-cd merge_profiling_timeline/get_nodes_timediff
-```
-
-#### 1.2 Install the dependency sshpass (used in 1.4 to run commands on the clients remotely)
-
-##### Ubuntu
-
-```shell
-apt-get install sshpass
-```
-
-##### CentOS
-
-```shell
-# install from source
-wget http://sourceforge.net/projects/sshpass/files/sshpass/1.05/sshpass-1.05.tar.gz
-tar -xvzf sshpass-1.05.tar.gz
-cd sshpass-1.05
-./configure
-make
-make install
-
-# or install via yum
-yum -y install sshpass
-```
-
-#### 1.3 Edit nodeinfo.json, keeping the cluster node order
-
-The file holds the IP address, user name, password, and port of every node.
-
-Note: the nodes must be listed in cluster node order. The sample content below must be replaced with the actual information of all nodes in the cluster.
-
-```shell
-root@root:~/test/merge_profiling_timeline/get_nodes_timediff$ cat nodeinfo.json
-{
-    "cluster": {
-        "90.90.66.62": {
-            "user": "dcs-62",
-            "pd": "password",
-            "port": 22
-        },
-        "90.90.66.64": {
-            "user": "dcs-64",
-            "pd": "password",
-            "port": 22
-        }
-    }
-}
-```
-
-#### 1.4 Run the get_nodes_timediff.sh script to generate time_difference.json, the file of server/client startup time differences
-
-```shell
-# bash get_nodes_timediff.sh {ip of the current node}
-bash get_nodes_timediff.sh 90.90.66.62
-```
-
-#### 1.5 Check that time_difference.json has been generated in the script's directory
-
-The file records each node's IP and the client's startup time difference relative to the server (client startup time minus server startup time), in cluster node order.
-
-```shell
-root@root:~/test/merge_profiling_timeline/get_nodes_timediff$ cat time_difference.json
-{
-    "90.90.66.62": -3.8049183785915375e-06,
-    "90.90.66.64": -1.551163767464459
-}
-```
-
-### 2 Merging multiple timeline files
-
-#### 2.1 Collect data with msprof and copy the profiling data of all nodes into one directory on the current machine; the steps below assume the data is under /home/test/all_cann_profiling
-
-#### 2.2 Merge the timelines with main.py under merge_profiling_timeline
-
-Options:
-- -d: **required**, path of the profiling data file or directory
-- -t: **required when merging timelines across nodes**, path of the time_difference.json file generated in step 1.3 above
-- -o: optional, output path of the merged timeline file; defaults to the path given to '-d'
-- --rank: optional, ids of the ranks whose timelines are merged; all ranks are merged by default
-- --items: optional, profiling data items to merge; all items are merged by default
+### 1.1 Collecting the data
+Collect the data with msprof and copy the profiling data of all nodes into one directory on the current machine; the steps below assume the data is under /home/test/cann_profiling
 
 Sketch of the profiling data directory structure; data required for the merge: `msprof.json` and `info.json.*`:
 ```
@@ -113,6 +22,17 @@ Sketch of the profiling data directory structure; data required for the merge:
     ...
 ```
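+
+Optionally, before merging, it can help to check that every PROF_* directory really contains the required files. A minimal sketch of such a check (the root path comes from the example above; the exact nesting inside each PROF_* directory is assumed, so adjust the glob patterns to your layout):
+```
+import glob
+import os
+
+root = "/home/test/cann_profiling"  # data directory from step 1.1
+for prof_dir in sorted(glob.glob(os.path.join(root, "PROF_*"))):
+    # search recursively because the nesting may differ between versions
+    msprof = glob.glob(os.path.join(prof_dir, "**", "msprof.json"), recursive=True)
+    info = glob.glob(os.path.join(prof_dir, "**", "info.json.*"), recursive=True)
+    if not (msprof and info):
+        print(f"{prof_dir}: missing msprof.json or info.json.*")
+```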
+### 1.2 Merging the timelines
+
+Parameters:
+- -d: **required**, path of the profiling data file or directory
+- -t: **required when merging multi-node, multi-device timelines**, since the time of different nodes must be calibrated; path of the time_difference.json calibration file, whose generation is described in [getting the time differences between nodes](https://gitee.com/aerfaliang/merge_profiling_timeline/tree/master/get_nodes_timediff)
+- -o: optional, output path of the merged timeline file; defaults to the path given to '-d'
+- --rank: optional, ids of the ranks whose timelines are merged; all ranks are merged by default
+- --items: optional, profiling data items to merge; all items are merged by default
+
+
+
 
 **Example**:
@@ -134,16 +54,45 @@ python3 main.py -d path/to/cann_profiling/ --items CANN,Ascend_Hardware
 
 4. When merging multi-node, multi-device timelines, pass the file of time differences between nodes via -t so that time can be calibrated across nodes:
-
 ```
 python3 main.py -d path/to/cann_profiling/ -t path/to/time_difference.json --rank 0,8
 ```
 
 Viewing the merged timeline:
-> Look in the directory given by -o (by default the directory given by -d) for the generated msprof_merged_*p.json, which is the merged file
+> msprof_merged_*p.json in the directory given by -o (by default the directory given by -d) is the merged file
+
+## 2 Merging multiple timelines (custom)
+### 2.1 Collecting the data
+Put all timeline files to be merged into one directory.
+Sketch of the data directory structure (the file names below are only an example):
+```
+|- timeline
+    |- msprof_0.json
+    |- msprof_1.json
+    |- msprof_2.json
+    |- msprof_3.json
+    |- step_trace_0.json
+    |- step_trace_1.json
+    |- step_trace_2.json
+    |- step_trace_3.json
+    ...
+```
+### 2.2 Merging the timelines
+Merge the timelines with the script `merge_profiling_timeline/main.py`.
+Parameters:
+- -d: **required**, path of the profiling data file or directory
+- -o: optional, output path of the merged timeline file; defaults to the path given to '-d'
+- --custom: **required**, tells the tool that this is the custom merge scenario
+
+**Example**:
+
+With all timelines to be merged placed in one directory, merge them with the following command:
+```
+python3 main.py -d path/to/timeline/ --custom
+```
+Viewing the merged timeline: same as in section 1.2.
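+
+Note: in custom mode each file is assigned an id (which then plays the role of the rank id during the merge) following the natural sort order of its file name, handled by `natural_sort` in `main.py`: numbers embedded in the names are compared as numbers, not character by character. A small illustration of that ordering, with hypothetical file names:
+```
+import re
+
+def natural_sort(files):
+    convert = lambda text: int(text) if text.isdigit() else text.lower()
+    return sorted(files, key=lambda k: [convert(c) for c in re.split('([0-9]+)', k)])
+
+print(natural_sort(["msprof_10.json", "msprof_2.json", "msprof_0.json"]))
+# -> ['msprof_0.json', 'msprof_2.json', 'msprof_10.json']
+```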
 
-### 3. Viewing very large timeline files
+## 3 Viewing very large timeline files
 Use the following command directly
 ```
diff --git a/debug/tools/merge_profiling_timeline/main.py b/debug/tools/merge_profiling_timeline/main.py
index 7cd9bd45a6dbaf6a96ee90790f4d373881d50ce7..dfa90eb05d265ffc796833452a00bb4c0d8e4748 100644
--- a/debug/tools/merge_profiling_timeline/main.py
+++ b/debug/tools/merge_profiling_timeline/main.py
@@ -15,6 +15,7 @@
 
 import json
 import os
+import re
 from functools import partial
 from argparse import ArgumentParser
 
@@ -55,6 +56,12 @@ def _path_dir_filter_func(sub_path, root_dir):
     return sub_path not in FILTER_DIRS and os.path.isdir(os.path.realpath(os.path.join(root_dir, sub_path)))
 
 
+def natural_sort(files):
+    """Sort file names so that embedded numbers compare numerically."""
+    convert = lambda text: int(text) if text.isdigit() else text.lower()
+    alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
+    return sorted(files, key=alphanum_key)
+
+
 def get_timeline_info(input_path, prof_dirs):
     timeline_info = {}
 
@@ -91,14 +98,14 @@ def get_rank_id_from_info_json(pro_path):
     return rank_id
 
 
-def merge_timeline(timeline_info, args):
-    """merge timeline of PROF_"""
-    new_events = []
+def merge_timeline_general(timeline_info, args):
+    """Merge the timelines of the PROF_* data collected by msprof."""
+    timeline_files_dict = {}
 
     node_time_diff = get_node_time_diff(args.timediff) if args.timediff else None
 
     # merge only the selected profiling items
-    merge_items = args.items.split(",") if args.items else None
+    process_list = args.items.split(",") if args.items else None
 
     # merge only the selected ranks
     if args.rank:
@@ -107,60 +114,89 @@
     rank_ids = list(timeline_info.keys())
 
     for rank_id in rank_ids:
-        timeline_file = timeline_info.get(rank_id)
+        timeline_files_dict[rank_id] = timeline_info.get(rank_id)
+    merge_timeline_events(timeline_files_dict, process_list, args.output, node_time_diff)
+
+
+def merge_timeline_custom(args):
+    """Merge all timeline files in the given directory."""
+    timeline_files = natural_sort(os.listdir(args.data))
+    timeline_files_dict = {}
+    for idx, timeline_file in enumerate(timeline_files):
+        timeline_files_dict[idx] = os.path.join(args.data, timeline_file)
+    node_time_diff = get_node_time_diff(args.timediff) if args.timediff else None
+    # merge only the selected profiling items
+    process_list = args.items.split(",") if args.items else None
+    merge_timeline_events(timeline_files_dict, process_list, args.output, node_time_diff)
+
+
+def merge_timeline_events(timeline_file_dict, process_list, output_dir, node_time_diff=None):
+    """
+    Input: the timeline file paths keyed by rank_id (or file id), the
+    process_list of items to merge, the output directory output_dir, and the
+    calibration time differences node_time_diff.
+    Output: the merged timeline, written to output_dir.
+    """
+    new_events = []
+    for rank_id, timeline_file_path in timeline_file_dict.items():
         node = rank_id // 8
-        print("rank id: ", rank_id, "timeline file: ", timeline_file)
+        print("rank id: ", rank_id, "timeline file: ", timeline_file_path)
         # get the corresponding time difference
         node_time = node_time_diff[node] if node_time_diff else None
-
-        with open(timeline_file, 'r+') as f:
-            cur_events = json.load(f)
-
-            proc_pid_dict = {}
-            for event in cur_events:
-                if event.get("name") == "process_name" and event.get("ph") == "M":
-                    if event.get("args"):
-                        proc_pid_dict[event["args"].get("name")] = event.get("pid")
-            process_list = merge_items if merge_items else list(proc_pid_dict.keys())
-            # extract the pids of the items to merge
-            merged_pids = set()
-            for pro in process_list:
-                pro = " ".join(pro.split("_")) if "_" in pro else pro
-
-                if pro not in proc_pid_dict.keys():
-                    print(f"{pro} is invalid item, valid items: {list(proc_pid_dict.keys())}")
-                    continue
-                merged_pids.add(proc_pid_dict.get(pro))
-
-            for event in cur_events:
-
-                # merge only the selected data items
-                if event.get('pid') not in merged_pids:
-                    continue
-
-                # calibrate the timestamp when the cross-node time difference is available
-                if event.get("ts") and node_time:
-                    event["ts"] = event["ts"] - node_time * 1000000
-
-                # distinguish the pids of the same process on different ranks
-                if isinstance(event.get("pid"), (str, int)):
-                    event["pid"] = int(''.join(x for x in str(event.get("pid")) if x.isdigit()) +
-                                       str(rank_id * MAX_INDEX_COUNT))
-
-                # convert tid to int
-                if isinstance(event.get("tid"), str):
-                    event["tid"] = int(''.join(x for x in event["tid"] if x.isdigit()))
-
-                # append rank_id to the process name to distinguish ranks
-                if event.get("name") == "process_name" and event.get("ph") == "M":
-                    if event.get("args") is not None and event["args"].get("name") is not None:
-                        event["args"]["name"] = event["args"]["name"] + f"_{rank_id}"
-
-                new_events.append(event)
-
-    output_path = os.path.join(args.output, f"msprof_merged_{len(rank_ids)}p.json")
+        try:
+            with open(timeline_file_path, 'r+') as f:
+                cur_events = json.load(f)
+        except Exception as err:
+            print("[ERROR] %s" % err)
+            return
+
+        proc_pid_dict = {}
+        for event in cur_events:
+            if event.get("name") == "process_name" and event.get("ph") == "M":
+                if event.get("args"):
+                    proc_pid_dict[event["args"].get("name")] = event.get("pid")
+        # fall back to all processes of the current file when no items are given
+        cur_process_list = process_list if process_list else list(proc_pid_dict.keys())
+        # extract the pids of the items to merge
+        merged_pids = set()
+        for pro in cur_process_list:
+            pro = " ".join(pro.split("_")) if "_" in pro else pro
+
+            if pro not in proc_pid_dict.keys():
+                print(f"{pro} is invalid item, valid items: {list(proc_pid_dict.keys())}")
+                continue
+            merged_pids.add(proc_pid_dict.get(pro))
+
+        for event in cur_events:
+
+            # merge only the selected data items
+            if merged_pids and event.get('pid') not in merged_pids:
+                continue
+
+            # calibrate the timestamp when the cross-node time difference is available
+            if event.get("ts") and node_time:
+                event["ts"] = event["ts"] - node_time * 1000000
+
+            # distinguish the pids of the same process on different ranks
+            if isinstance(event.get("pid"), (str, int)):
+                event["pid"] = int(''.join(x for x in str(event.get("pid")) if x.isdigit()) +
+                                   str(rank_id * MAX_INDEX_COUNT))
+
+            # convert tid to int
+            if isinstance(event.get("tid"), str):
+                event["tid"] = int(''.join(x for x in event["tid"] if x.isdigit()))
+
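+            # Worked example of the pid remapping above, with hypothetical values:
+            # assuming MAX_INDEX_COUNT is 1000, an event of rank 2 whose original
+            # pid is "4132" gets pid int("4132" + str(2 * 1000)) = 41322000, so the
+            # same process coming from different ranks no longer shares one pid.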
+            # append rank_id to the process name to distinguish ranks
+            if event.get("name") == "process_name" and event.get("ph") == "M":
+                if event.get("args") is not None and event["args"].get("name") is not None:
+                    event["args"]["name"] = event["args"]["name"] + f"_{rank_id}"
+
+            new_events.append(event)
+
+    output_path = os.path.join(output_dir, f"msprof_merged_{len(timeline_file_dict)}p.json")
     with open(output_path, 'w') as f:
         json.dump(new_events, f)
+    print(f"timeline merged output path: {output_path}")
 
 
 def parse_args():
@@ -169,16 +205,13 @@
     parser.add_argument("--timediff", "-t", default=None, help="JSON file for saving startup time differences")
     parser.add_argument("--output", "-o", default=None, help="save path of msprof_merged.json ")
     parser.add_argument("--rank", default=None, help="List of ranks to be merged. By default, all ranks are merged")
-    parser.add_argument("--items", default=None, help="Items to be combined in the timeline."
-                        " The options are ['MsprofTx', 'AscendCL', 'Runtime', 'AI CPU',"
-                        "'Task Scheduler', and 'HCCL'].")
+    parser.add_argument("--items", default=None, help="Specify the data items to be merged in the timeline.")
+    parser.add_argument("--custom", action="store_true", help="Merge all timeline files under the -d directory (custom scenario).")
     arg = parser.parse_args()
     return arg
 
 
 if __name__ == "__main__":
-    import time
-    start = time.time()
     args = parse_args()
     prof_dir = get_path_dir(args.data)
@@ -186,6 +219,7 @@
     if not args.output:
         args.output = args.data
     print("========================== start merge timeline ====================")
-    merge_timeline(timeline_info, args)
-    end = time.time()
-    print(f"msprof.json merged finished cost time: {end - start}s")
+    if args.custom:
+        merge_timeline_custom(args)
+    else:
+        merge_timeline_general(timeline_info, args)
\ No newline at end of file