From b6558b7705f0876960cae1b4f8ab3509f9916064 Mon Sep 17 00:00:00 2001 From: zhouyanming Date: Wed, 5 Jul 2023 17:21:24 +0800 Subject: [PATCH] add profiling analyse tool --- debug/tools/profiling_analyse/README.md | 53 +++++++++++++ debug/tools/profiling_analyse/__init__.py | 0 debug/tools/profiling_analyse/gpu_parser.py | 65 ++++++++++++++++ debug/tools/profiling_analyse/npu_parser.py | 71 ++++++++++++++++++ .../tools/profiling_analyse/parser_helper.py | 27 +++++++ .../profiling_analyse/profiling_parse.py | 75 +++++++++++++++++++ 6 files changed, 291 insertions(+) create mode 100644 debug/tools/profiling_analyse/README.md create mode 100644 debug/tools/profiling_analyse/__init__.py create mode 100644 debug/tools/profiling_analyse/gpu_parser.py create mode 100644 debug/tools/profiling_analyse/npu_parser.py create mode 100644 debug/tools/profiling_analyse/parser_helper.py create mode 100644 debug/tools/profiling_analyse/profiling_parse.py diff --git a/debug/tools/profiling_analyse/README.md b/debug/tools/profiling_analyse/README.md new file mode 100644 index 000000000..4d2e62d2a --- /dev/null +++ b/debug/tools/profiling_analyse/README.md @@ -0,0 +1,53 @@ +# profiling_analyse + +## gpu性能数据拆解 +### 算子耗时 +1. 获取json文件中的traceEvents字段,获取所有cat为kernel的,且name中不包含nccl的event,将他们的耗时相加即为所有算子耗时all_time; +2. 取算子中name包含gemm的为cube算子耗时cube_time. +3. 
vector算子耗时即为(all_time - cube_time)
+### 大kernel算子
+待补充大kernel列表
+### 通信
+此处的通信指通信未掩盖耗时,gpu上暂无明确的计算方法,故取profiling展示图中的通信流耗时结果
+实际计算是取name中包含'ncclKernel_'的event,将它们的耗时相加
+### 计算流e2e耗时
+按照时间也就是'ts'字段排序所有events,可以看到最后的event是Record Window End,故使用最后一个event的ts值减去第一个event的ts值作为计算流e2e耗时
+### 调度
+由于gpu上目前没有较好的通信不可掩盖耗时算法,所以gpu上的调度耗时计算方法采用:调度耗时 = 计算流E2E耗时 - 计算流任务执行总耗时
+计算流为stream为7的流,实际计算取stream为7的event耗时相加;
+计算流的stream不一定是7,后续会适配通过观察kernel算子分布来判断计算流的方法
+获得调度耗时后,使用调度占比 = 调度耗时/E2E耗时 获取调度占比
+### 内存分析
+gpu上的内存使用可以使用nvidia-smi查看,使用json文件分析时需要打开profile_memory=True开关
+获取运行稳定后的step的profiling数据,获取所有name值为[memory]的event,获取这类event中Total Reserved值的最大值作为内存值
+
+## npu性能数据解析
+### 算子耗时
+1、算子耗时profiling数据位于/PROFxxx/device_x/summary路径下的op_summary_x_1.csv文件中。
+2、当前仅统计算子运行在vector和cube上的耗时。
+3、这2种算子于csv文件中的TaskType均为AI_CORE,其中aiv_vec_time大于aic_mac_time表明为vector算子,否则为cube算子。分别累加求和算子耗时进行输出。
+
+### 大kernel算子
+待补充大kernel算子列表
+
+### 通信
+此处的通信为通信未掩盖耗时,对应为ASCEND_PROFILER_OUTPUT/trace_view.json下的communication_not_overlapped。
+输出结果为该字段时间求和。
+
+### 计算流e2e耗时
+此耗时通过统计trace_view.json中时间戳'ts'的最小值和最大值,其时间差的绝对值即为e2e耗时。
+
+### 调度占比
+1、调度占比的求取需先计算调度耗时,调度占比=调度耗时/e2e耗时 * 100%。
+2、调度耗时的计算方法有2种,①调度耗时=单步打屏时间-算子耗时-通信不可掩盖耗时,②调度耗时=e2e耗时-计算流执行任务总耗时。
+3、由于“单步打屏时间”需额外记录输入,暂不使用方法①,方法②中的计算流执行任务总耗时即为trace_view.json下的compute_time。
+
+### 内存
+1、内存统计的数据来源于ASCEND_PROFILER_OUTPUT/memory_record.csv中的“Total Reserved(MB)”。
+2、其值在模型训练趋于稳定时逐渐固定,整体偏差不大,因此输出结果为该列数据的最大值。
+
+## 样例
+- step1:下载数据:https://onebox.huawei.com/v/2ad3400460fac22fa61f21f478edd116
+
+- 运行命令:python profiling_parse.py -g prof0704_best\gpu\gpu_trace_device0.json -n prof0704_best\Malluma_443350_20230704144255_ascend_pt
+- 输出结果:可以得到gpu与npu对照的打屏性能拆解数据
diff --git a/debug/tools/profiling_analyse/__init__.py b/debug/tools/profiling_analyse/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/debug/tools/profiling_analyse/gpu_parser.py b/debug/tools/profiling_analyse/gpu_parser.py
new file mode 100644
index 000000000..a269c1430
--- /dev/null
+++ 
b/debug/tools/profiling_analyse/gpu_parser.py
@@ -0,0 +1,91 @@
+import pandas as pd
+
+import parser_helper
+
+US_TO_S = 10 ** 6  # trace timestamps/durations are recorded in microseconds
+
+
+class GpuProfilingParser:
+    """Parse a gpu chrome-trace json file into a ProfilingInfo summary."""
+
+    COMPUTE_STREAM_ID = 7  # default compute stream; may need adaption per trace
+
+    def __init__(self, gpu_trace_file):
+        self.trace_events = self.read_profiling_json_file(gpu_trace_file)
+        self.profiling_info = parser_helper.ProfilingInfo()
+
+    @staticmethod
+    def read_profiling_json_file(json_path):
+        """Load the trace json and return its 'traceEvents' list."""
+        data = parser_helper.read_json_file(json_path)
+        if 'traceEvents' not in data:
+            raise RuntimeError("The gpu profiling json doesn't contain traceEvents data.")
+        return data.get('traceEvents')
+
+    def parse_events(self):
+        """Accumulate cube/vector/communication/scheduling statistics from all events."""
+        cube_time = 0.0
+        all_op_time = 0.0
+        communication_not_overlapped = 0.0
+        op_list = []
+        compute_stream_dur = 0.0  # total busy time of the compute stream
+
+        for event in self.trace_events:
+            if not isinstance(event, dict):
+                continue
+            args = event.get('args')
+            # some compute-stream records carry no 'dur'; skip them instead of crashing
+            if args and args.get('stream') == self.COMPUTE_STREAM_ID and event.get('dur') is not None:
+                compute_stream_dur += float(event.get('dur'))
+            # need all four keys: '<=' (subset), not '<' (proper subset), so events
+            # that carry exactly these keys are not wrongly skipped
+            if not {'name', 'cat', 'dur', 'ts'} <= event.keys():
+                continue
+            name = event.get('name')
+            dur = event.get('dur')
+            if 'nccl' in name:
+                # only 'ncclKernel_' events count as not-overlapped communication
+                if 'ncclKernel_' in name:
+                    communication_not_overlapped += float(dur)
+                continue
+            cat = event.get('cat')
+            if cat.lower() != 'kernel':
+                continue
+            if 'gemm' in name:  # gemm kernels are counted as cube operators
+                cube_time += float(dur)
+            all_op_time += float(dur)
+            op_list.append([event.get('ts'), name, cat, dur])
+        op_dataframe = pd.DataFrame(op_list, columns=['time start', 'name', 'cat', 'dur'])
+        op_dataframe.to_csv('gpu_perf.csv', index=False)
+        self.profiling_info.communication_not_overlapped = communication_not_overlapped / US_TO_S
+        self.profiling_info.cube_time = cube_time / US_TO_S
+        self.profiling_info.vector_time = (all_op_time - cube_time) / US_TO_S
+        self.parse_e2e_time()
+        self.profiling_info.scheduling_time = self.profiling_info.e2e_time - compute_stream_dur / US_TO_S
+        if self.profiling_info.e2e_time > 0:  # avoid ZeroDivisionError on degenerate traces
+            self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time
+        self.parse_memory_reserved()
+
+    def parse_e2e_time(self):
+        """E2E time spans from the first to the last timestamped event."""
+        # metadata events may carry no 'ts'; sorting on a None key would raise TypeError
+        timestamps = [event.get('ts') for event in self.trace_events
+                      if isinstance(event, dict) and event.get('ts') is not None]
+        if not timestamps:
+            print("Gpu profiling data doesn't contain timestamped events")
+            return
+        self.profiling_info.e2e_time = (max(timestamps) - min(timestamps)) / US_TO_S
+
+    def parse_memory_reserved(self):
+        """Memory used is the max 'Total Reserved' over per-device [memory] events."""
+        memories = []
+        for event in self.trace_events:
+            if not isinstance(event, dict) or event.get('name') != '[memory]':
+                continue
+            args = event.get('args') or {}
+            device_id = args.get('Device Id')
+            # skip process-level records that carry no non-negative device id
+            if device_id is None or device_id < 0:
+                continue
+            memories.append(args.get('Total Reserved'))
+        if not memories:
+            print("Gpu profiling data doesn't contain memory info")
+            return
+        self.profiling_info.memory_used = max(memories) / 1024 ** 3
diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py
new file mode 100644
index 000000000..7e5949812
--- /dev/null
+++ b/debug/tools/profiling_analyse/npu_parser.py
@@ -0,0 +1,71 @@
+import sys
+
+import pandas as pd
+
+import parser_helper
+
+US_TO_S = 10 ** 6  # durations/timestamps are recorded in microseconds
+
+
+class NpuProfilingParser:
+    """Parse npu trace_view.json and summary/memory csv files into a ProfilingInfo."""
+
+    def __init__(self, npu_file_path):
+        self.npu_json_file = npu_file_path.get('trace_view')
+        self.npu_summary_file = npu_file_path.get('op_summary')
+        self.npu_mem_file = npu_file_path.get('memory_record')
+        self.profiling_info = parser_helper.ProfilingInfo()
+
+    def parse_npu_json_events(self):
+        """Fill e2e/communication/scheduling statistics from trace_view.json."""
+        communication_time = 0.0
+        compute_time = 0.0
+        min_ts = sys.float_info.max
+        max_ts = sys.float_info.min
+        data = parser_helper.read_json_file(self.npu_json_file)
+        for event in data:
+            name = event.get('name')
+            if name == 'communication_not_overlapped':
+                communication_time += float(event.get('dur'))
+            elif name == 'compute_time':
+                compute_time += float(event.get('dur'))
+            ts = event.get('ts')
+            if ts is not None:  # 'ts' == 0 is a valid timestamp; don't test truthiness
+                min_ts = min(min_ts, ts)
+                max_ts = max(max_ts, ts)
+        # with no timestamped events at all, report zero instead of a huge negative span
+        self.profiling_info.e2e_time = (max_ts - min_ts) / US_TO_S if max_ts >= min_ts else 0.0
+        self.profiling_info.communication_not_overlapped = communication_time / US_TO_S
+        self.profiling_info.scheduling_time = self.profiling_info.e2e_time - compute_time / US_TO_S
+        if self.profiling_info.e2e_time > 0:  # avoid ZeroDivisionError
+            self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time
+
+    def parse_npu_csv_events(self):
+        """Fill cube/vector op times and memory statistics from the csv files."""
+        info = pd.read_csv(self.npu_summary_file, index_col=None)
+        if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None:
+            raise ValueError('There is no cube time or vector time in the csv! The aic_mac_time(us) and '
+                             'aiv_vec_time(us) are necessary for the determination of cube and vector.')
+        # per-OP-Type accumulated duration and operator kind
+        op_statics_result = {}
+        cube_time = 0.0
+        vec_time = 0.0
+        for _, row in info.iterrows():
+            if row['Task Type'] != 'AI_CORE':
+                continue  # only AI_CORE tasks are classified as cube or vector
+            op_type = row['OP Type']
+            task_duration = row['Task Duration(us)']
+            # a dominating aiv_vec_time marks a vector operator, otherwise cube
+            kind = 'vector' if row['aiv_vec_time(us)'] > row['aic_mac_time(us)'] else 'cube'
+            if kind == 'vector':
+                vec_time += task_duration
+            else:
+                cube_time += task_duration
+            if op_type in op_statics_result:
+                op_statics_result[op_type][0] += task_duration
+            else:
+                op_statics_result[op_type] = [task_duration, kind]
+        mem_info = pd.read_csv(self.npu_mem_file, usecols=['Total Reserved(MB)'], index_col=None)
+        self.profiling_info.memory_used = max(mem_info.get('Total Reserved(MB)')) / 1024
+        self.profiling_info.cube_time = cube_time / US_TO_S
+        self.profiling_info.vector_time = vec_time / US_TO_S
diff --git a/debug/tools/profiling_analyse/parser_helper.py b/debug/tools/profiling_analyse/parser_helper.py
new file mode 100644
index 000000000..0b9a0e048
--- /dev/null
+++ b/debug/tools/profiling_analyse/parser_helper.py
@@ -0,0 +1,30 @@
+import json
+import os
+
+
+class ProfilingInfo:
+    """Per-device profiling breakdown; times in seconds, memory in GB."""
+
+    def __init__(self):
+        self.cube_time = 0.0
+        self.vector_time = 0.0
+        self.large_kernel = 0.0
+        self.communication_not_overlapped = 0.0
+        self.scheduling_ratio = 0.0
+        self.memory_used = 0.0
+        self.e2e_time = 0.0
+        self.scheduling_time = 0.0
+
+    def __setattr__(self, key, value):
+        # every statistic must be non-negative; fail fast on bogus results
+        if value < 0:
+            raise ValueError(f"The {key} value shouldn't be less than 0.")
+        super().__setattr__(key, value)
+
+
+def read_json_file(path):
+    """Load a json file; raise ValueError when path is not an existing file."""
+    if not os.path.isfile(path):
+        raise ValueError(f'The path "{path}" is not a valid json file.')
+    with open(path, 'r', encoding='utf-8') as json_handler:
+        return json.load(json_handler)
diff --git a/debug/tools/profiling_analyse/profiling_parse.py b/debug/tools/profiling_analyse/profiling_parse.py
new file mode 100644
index 000000000..0d2dde05a
--- /dev/null
+++ b/debug/tools/profiling_analyse/profiling_parse.py
@@ -0,0 +1,82 @@
+import argparse
+import os
+
+from prettytable import PrettyTable
+
+from gpu_parser import GpuProfilingParser
+from npu_parser import NpuProfilingParser
+from parser_helper import ProfilingInfo
+
+
+def parse_command():
+    """Parse the -g/-n command line arguments."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-g', '--gpu', required=False, default='', metavar='(FILE)', help='Gpu profiling json file.')
+    parser.add_argument('-n', '--npu', required=False, default='', metavar='(FILE)',
+                        help='Npu single core profiling root path.')
+    return parser.parse_args()
+
+
+def _profiling_row(label, info):
+    """Build one PrettyTable row from a ProfilingInfo."""
+    return [label, f'{info.cube_time:.3f}s', f'{info.vector_time:.3f}s',
+            f'{info.large_kernel:.3f}s', f'{info.communication_not_overlapped:.3f}s',
+            f'{info.scheduling_time:.3f}s', f'{info.scheduling_ratio:.2%}',
+            f'{info.memory_used:.2f}G', f'{info.e2e_time:.3f}s']
+
+
+def show_table(gpu_profiling_info, npu_profiling_info):
+    """Print the gpu-vs-npu performance breakdown table."""
+    table = PrettyTable()
+    table.title = '大模型性能拆解'
+    table.field_names = ['', 'cube算子', 'vector算子', '大kernel算子', '通信', '调度耗时', '调度占比', '内存',
+                         'E2E性能值']
+    table.add_row(_profiling_row('GPU基线', gpu_profiling_info))
+    table.add_row(_profiling_row('当前现状', npu_profiling_info))
+    print(table)
+
+
+def parse_gpu(args):
+    """Parse the gpu trace file when specified; otherwise return an empty ProfilingInfo."""
+    if args.gpu:
+        gpu_parser = GpuProfilingParser(args.gpu)
+        gpu_parser.parse_events()
+        return gpu_parser.profiling_info
+    print('Gpu trace json file is not specified.')
+    return ProfilingInfo()
+
+
+def parse_npu(npu_path):
+    """Parse the npu profiling files when all of them are available."""
+    if not npu_path.get('trace_view'):
+        print('Npu trace json file is not available.')
+        return ProfilingInfo()
+    if not npu_path.get('op_summary'):
+        print('Npu op summary csv file is not available.')
+        return ProfilingInfo()
+    if not npu_path.get('memory_record'):
+        print('Npu op memory csv file is not available.')
+        return ProfilingInfo()
+    npu_parser = NpuProfilingParser(npu_path)
+    npu_parser.parse_npu_json_events()
+    npu_parser.parse_npu_csv_events()
+    return npu_parser.profiling_info
+
+
+def main():
+    """Collect the npu profiling file paths and print the comparison table."""
+    args = parse_command()
+    npu_path = {'trace_view': None, 'memory_record': None, 'op_summary': None}
+    for root, _, files in os.walk(args.npu):
+        for file in files:
+            if file == 'trace_view.json':
+                npu_path['trace_view'] = os.path.join(root, file)
+            if file == 'memory_record.csv':
+                npu_path['memory_record'] = os.path.join(root, file)
+            if 'op_summary' in file:
+                npu_path['op_summary'] = os.path.join(root, file)
+    show_table(parse_gpu(args), parse_npu(npu_path))
+
+
+if __name__ == '__main__':
+    main()
--
Gitee