From b6558b7705f0876960cae1b4f8ab3509f9916064 Mon Sep 17 00:00:00 2001 From: zhouyanming Date: Wed, 5 Jul 2023 17:21:24 +0800 Subject: [PATCH] add profiling analyse tool --- debug/tools/profiling_analyse/README.md | 53 +++++++++++++ debug/tools/profiling_analyse/__init__.py | 0 debug/tools/profiling_analyse/gpu_parser.py | 65 ++++++++++++++++ debug/tools/profiling_analyse/npu_parser.py | 71 ++++++++++++++++++ .../tools/profiling_analyse/parser_helper.py | 27 +++++++ .../profiling_analyse/profiling_parse.py | 75 +++++++++++++++++++ 6 files changed, 291 insertions(+) create mode 100644 debug/tools/profiling_analyse/README.md create mode 100644 debug/tools/profiling_analyse/__init__.py create mode 100644 debug/tools/profiling_analyse/gpu_parser.py create mode 100644 debug/tools/profiling_analyse/npu_parser.py create mode 100644 debug/tools/profiling_analyse/parser_helper.py create mode 100644 debug/tools/profiling_analyse/profiling_parse.py diff --git a/debug/tools/profiling_analyse/README.md b/debug/tools/profiling_analyse/README.md new file mode 100644 index 000000000..4d2e62d2a --- /dev/null +++ b/debug/tools/profiling_analyse/README.md @@ -0,0 +1,53 @@ +# profiling_analyse + +## gpu性能数据拆解 +### 算子耗时 +1. 获取json文件中的traceEvents字段,获取所有cat为kernel的,且name中不包含nccl的event,将他们的耗时相加即为所有算子耗时all_time; +2. 取算子中name包含gemm的为cube算子耗时cube_time. +3. 
vector算子耗时即为(all_time - cube_time)
+### 大kernel算子
+待补充大kernel列表
+### 通信
+此处的通信指通信未掩盖耗时,gpu上暂无明确的计算方法,故取profiling展示图中的通信流耗时结果
+实际计算是取name中包含'ncclKernel_'的event,将它们的耗时相加
+### 计算流e2e耗时
+按照时间也就是'ts'字段排序所有events,可以看到最后的event是Record Window End,故使用最后一个event的ts值减去第一个event的ts值作为计算流e2e耗时
+### 调度
+由于gpu上目前没有较好的通信不可掩盖耗时算法,所以gpu上的调度耗时计算方法采用:调度耗时 = 计算流E2E耗时 - 计算流任务执行总耗时
+计算流为stream为7的流,实际计算取stream为7的event耗时相加;
+计算流的stream不一定是7,后续会适配通过观察kernel算子分布来判断计算流的方法
+获得调度耗时后,使用调度占比 = 调度耗时/E2E耗时 获取调度占比
+### 内存分析
+gpu上的内存使用可以使用nvidia-smi查看,使用json文件分析时需要打开profile_memory=True开关
+获取运行稳定后的step的profiling数据,获取所有name值为[memory]的event,获取这类event中Total Reserved值的最大值作为内存值
+
+## npu性能数据解析
+### 算子耗时
+1、算子耗时profiling数据位于/PROFxxx/device_x/summary路径下的op_summary_x_1.csv文件中。
+2、当前仅统计算子运行在vector和cube上的耗时。
+3、这2种算子于csv文件中的TaskType均为AI_CORE,其中aiv_vec_time大于aic_mac_time表明为vector算子,否则为cube算子。分别累加求和算子耗时进行输出。
+
+### 大kernel算子
+待补充大kernel算子列表
+
+### 通信
+此处的通信为通信未掩盖耗时,对应为ASCEND_PROFILER_OUTPUT/trace_view.json下的communication_not_overlapped。
+输出结果为该字段时间求和。
+
+### 计算流e2e耗时
+此耗时通过统计trace_view.json中时间戳'ts'的最小值和最大值,其时间差的绝对值即为e2e耗时。
+
+### 调度占比
+1、调度占比的求取需先计算调度耗时,调度占比=调度耗时/e2e耗时 * 100%。
+2、调度耗时的计算方法有2种,①调度耗时=单步打屏时间-算子耗时-通信不可掩盖耗时,②调度耗时=e2e耗时-计算流执行任务总耗时。
+3、由于“单步打屏时间”需额外记录输入,暂不使用方法①,方法②中的计算流执行任务总耗时即为trace_view.json下的compute_time。
+
+### 内存
+1、内存统计的数据来源于ASCEND_PROFILER_OUTPUT/memory_record.csv中的“Total Reserved(MB)”。
+2、其值在模型训练趋于稳定时逐渐固定,整体偏差不大,因此输出结果为该列数据的最大值。
+
+## 样例
+- step1:下载数据:https://onebox.huawei.com/v/2ad3400460fac22fa61f21f478edd116
+
+- 运行命令:python profiling_parse.py -g prof0704_best\gpu\gpu_trace_device0.json -n prof0704_best\Malluma_443350_20230704144255_ascend_pt
+- 输出结果:可以得到gpu与npu对照的打屏性能拆解数据
diff --git a/debug/tools/profiling_analyse/__init__.py b/debug/tools/profiling_analyse/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/debug/tools/profiling_analyse/gpu_parser.py b/debug/tools/profiling_analyse/gpu_parser.py
new file mode 100644
index 000000000..a269c1430
--- /dev/null
+++ 
b/debug/tools/profiling_analyse/gpu_parser.py
@@ -0,0 +1,91 @@
+import pandas as pd
+
+import parser_helper
+
+US_TO_S = 10 ** 6  # trace timestamps/durations are recorded in microseconds
+
+
+class GpuProfilingParser:
+    """Parse a gpu chrome-trace json file into a ProfilingInfo summary."""
+
+    COMPUTE_STREAM_ID = 7  # default compute stream; may need adaption per trace
+
+    def __init__(self, gpu_trace_file):
+        self.trace_events = self.read_profiling_json_file(gpu_trace_file)
+        self.profiling_info = parser_helper.ProfilingInfo()
+
+    @staticmethod
+    def read_profiling_json_file(json_path):
+        """Load the trace json and return its 'traceEvents' list."""
+        data = parser_helper.read_json_file(json_path)
+        if 'traceEvents' not in data:
+            raise RuntimeError("The gpu profiling json doesn't contain traceEvents data.")
+        return data.get('traceEvents')
+
+    def parse_events(self):
+        """Accumulate cube/vector/communication/scheduling statistics from all events."""
+        cube_time = 0.0
+        all_op_time = 0.0
+        communication_not_overlapped = 0.0
+        op_list = []
+        compute_stream_dur = 0.0  # total busy time of the compute stream
+
+        for event in self.trace_events:
+            if not isinstance(event, dict):
+                continue
+            args = event.get('args')
+            # some compute-stream records carry no 'dur'; skip them instead of crashing
+            if args and args.get('stream') == self.COMPUTE_STREAM_ID and event.get('dur') is not None:
+                compute_stream_dur += float(event.get('dur'))
+            # need all four keys: '<=' (subset), not '<' (proper subset), so events
+            # that carry exactly these keys are not wrongly skipped
+            if not {'name', 'cat', 'dur', 'ts'} <= event.keys():
+                continue
+            name = event.get('name')
+            dur = event.get('dur')
+            if 'nccl' in name:
+                # only 'ncclKernel_' events count as not-overlapped communication
+                if 'ncclKernel_' in name:
+                    communication_not_overlapped += float(dur)
+                continue
+            cat = event.get('cat')
+            if cat.lower() != 'kernel':
+                continue
+            if 'gemm' in name:  # gemm kernels are counted as cube operators
+                cube_time += float(dur)
+            all_op_time += float(dur)
+            op_list.append([event.get('ts'), name, cat, dur])
+        op_dataframe = pd.DataFrame(op_list, columns=['time start', 'name', 'cat', 'dur'])
+        op_dataframe.to_csv('gpu_perf.csv', index=False)
+        self.profiling_info.communication_not_overlapped = communication_not_overlapped / US_TO_S
+        self.profiling_info.cube_time = cube_time / US_TO_S
+        self.profiling_info.vector_time = (all_op_time - cube_time) / US_TO_S
+        self.parse_e2e_time()
+        self.profiling_info.scheduling_time = self.profiling_info.e2e_time - compute_stream_dur / US_TO_S
+        if self.profiling_info.e2e_time > 0:  # avoid ZeroDivisionError on degenerate traces
+            self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time
+        self.parse_memory_reserved()
+
+    def parse_e2e_time(self):
+        """E2E time spans from the first to the last timestamped event."""
+        # metadata events may carry no 'ts'; sorting on a None key would raise TypeError
+        timestamps = [event.get('ts') for event in self.trace_events
+                      if isinstance(event, dict) and event.get('ts') is not None]
+        if not timestamps:
+            print("Gpu profiling data doesn't contain timestamped events")
+            return
+        self.profiling_info.e2e_time = (max(timestamps) - min(timestamps)) / US_TO_S
+
+    def parse_memory_reserved(self):
+        """Memory used is the max 'Total Reserved' over per-device [memory] events."""
+        memories = []
+        for event in self.trace_events:
+            if not isinstance(event, dict) or event.get('name') != '[memory]':
+                continue
+            args = event.get('args') or {}
+            device_id = args.get('Device Id')
+            # skip process-level records that carry no non-negative device id
+            if device_id is None or device_id < 0:
+                continue
+            memories.append(args.get('Total Reserved'))
+        if not memories:
+            print("Gpu profiling data doesn't contain memory info")
+            return
+        self.profiling_info.memory_used = max(memories) / 1024 ** 3
diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py
new file mode 100644
index 000000000..7e5949812
--- /dev/null
+++ b/debug/tools/profiling_analyse/npu_parser.py
@@ -0,0 +1,71 @@
+import sys
+
+import pandas as pd
+
+import parser_helper
+
+US_TO_S = 10 ** 6  # durations/timestamps are recorded in microseconds
+
+
+class NpuProfilingParser:
+    """Parse npu trace_view.json and summary/memory csv files into a ProfilingInfo."""
+
+    def __init__(self, npu_file_path):
+        self.npu_json_file = npu_file_path.get('trace_view')
+        self.npu_summary_file = npu_file_path.get('op_summary')
+        self.npu_mem_file = npu_file_path.get('memory_record')
+        self.profiling_info = parser_helper.ProfilingInfo()
+
+    def parse_npu_json_events(self):
+        """Fill e2e/communication/scheduling statistics from trace_view.json."""
+        communication_time = 0.0
+        compute_time = 0.0
+        min_ts = sys.float_info.max
+        max_ts = sys.float_info.min
+        data = parser_helper.read_json_file(self.npu_json_file)
+        for event in data:
+            name = event.get('name')
+            if name == 'communication_not_overlapped':
+                communication_time += float(event.get('dur'))
+            elif name == 'compute_time':
+                compute_time += float(event.get('dur'))
+            ts = event.get('ts')
+            if ts is not None:  # 'ts' == 0 is a valid timestamp; don't test truthiness
+                min_ts = min(min_ts, ts)
+                max_ts = max(max_ts, ts)
+        # with no timestamped events at all, report zero instead of a huge negative span
+        self.profiling_info.e2e_time = (max_ts - min_ts) / US_TO_S if max_ts >= min_ts else 0.0
+        self.profiling_info.communication_not_overlapped = communication_time / US_TO_S
+        self.profiling_info.scheduling_time = self.profiling_info.e2e_time - compute_time / US_TO_S
+        if self.profiling_info.e2e_time > 0:  # avoid ZeroDivisionError
+            self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time
+
+    def parse_npu_csv_events(self):
+        """Fill cube/vector op times and memory statistics from the csv files."""
+        info = pd.read_csv(self.npu_summary_file, index_col=None)
+        if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None:
+            raise ValueError('There is no cube time or vector time in the csv! The aic_mac_time(us) and '
+                             'aiv_vec_time(us) are necessary for the determination of cube and vector.')
+        # per-OP-Type accumulated duration and operator kind
+        op_statics_result = {}
+        cube_time = 0.0
+        vec_time = 0.0
+        for _, row in info.iterrows():
+            if row['Task Type'] != 'AI_CORE':
+                continue  # only AI_CORE tasks are classified as cube or vector
+            op_type = row['OP Type']
+            task_duration = row['Task Duration(us)']
+            # a dominating aiv_vec_time marks a vector operator, otherwise cube
+            kind = 'vector' if row['aiv_vec_time(us)'] > row['aic_mac_time(us)'] else 'cube'
+            if kind == 'vector':
+                vec_time += task_duration
+            else:
+                cube_time += task_duration
+            if op_type in op_statics_result:
+                op_statics_result[op_type][0] += task_duration
+            else:
+                op_statics_result[op_type] = [task_duration, kind]
+        mem_info = pd.read_csv(self.npu_mem_file, usecols=['Total Reserved(MB)'], index_col=None)
+        self.profiling_info.memory_used = max(mem_info.get('Total Reserved(MB)')) / 1024
+        self.profiling_info.cube_time = cube_time / US_TO_S
+        self.profiling_info.vector_time = vec_time / US_TO_S
diff --git a/debug/tools/profiling_analyse/parser_helper.py b/debug/tools/profiling_analyse/parser_helper.py
new file mode 100644
index 000000000..0b9a0e048
--- /dev/null
+++ b/debug/tools/profiling_analyse/parser_helper.py
@@ -0,0 +1,30 @@
+import json
+import os
+
+
+class ProfilingInfo:
+    """Per-device profiling breakdown; times in seconds, memory in GB."""
+
+    def __init__(self):
+        self.cube_time = 0.0
+        self.vector_time = 0.0
+        self.large_kernel = 0.0
+        self.communication_not_overlapped = 0.0
+        self.scheduling_ratio = 0.0
+        self.memory_used = 0.0
+        self.e2e_time = 0.0
+        self.scheduling_time = 0.0
+
+    def __setattr__(self, key, value):
+        # every statistic must be non-negative; fail fast on bogus results
+        if value < 0:
+            raise ValueError(f"The {key} value shouldn't be less than 0.")
+        super().__setattr__(key, value)
+
+
+def read_json_file(path):
+    """Load a json file; raise ValueError when path is not an existing file."""
+    if not os.path.isfile(path):
+        raise ValueError(f'The path "{path}" is not a valid json file.')
+    with open(path, 'r', encoding='utf-8') as json_handler:
+        return json.load(json_handler)
diff --git a/debug/tools/profiling_analyse/profiling_parse.py b/debug/tools/profiling_analyse/profiling_parse.py
new file mode 100644
index 000000000..0d2dde05a
--- /dev/null
+++ b/debug/tools/profiling_analyse/profiling_parse.py
@@ -0,0 +1,82 @@
+import argparse
+import os
+
+from prettytable import PrettyTable
+
+from gpu_parser import GpuProfilingParser
+from npu_parser import NpuProfilingParser
+from parser_helper import ProfilingInfo
+
+
+def parse_command():
+    """Parse the -g/-n command line arguments."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-g', '--gpu', required=False, default='', metavar='(FILE)', help='Gpu profiling json file.')
+    parser.add_argument('-n', '--npu', required=False, default='', metavar='(FILE)',
+                        help='Npu single core profiling root path.')
+    return parser.parse_args()
+
+
+def _profiling_row(label, info):
+    """Build one PrettyTable row from a ProfilingInfo."""
+    return [label, f'{info.cube_time:.3f}s', f'{info.vector_time:.3f}s',
+            f'{info.large_kernel:.3f}s', f'{info.communication_not_overlapped:.3f}s',
+            f'{info.scheduling_time:.3f}s', f'{info.scheduling_ratio:.2%}',
+            f'{info.memory_used:.2f}G', f'{info.e2e_time:.3f}s']
+
+
+def show_table(gpu_profiling_info, npu_profiling_info):
+    """Print the gpu-vs-npu performance breakdown table."""
+    table = PrettyTable()
+    table.title = '大模型性能拆解'
+    table.field_names = ['', 'cube算子', 'vector算子', '大kernel算子', '通信', '调度耗时', '调度占比', '内存',
+                         'E2E性能值']
+    table.add_row(_profiling_row('GPU基线', gpu_profiling_info))
+    table.add_row(_profiling_row('当前现状', npu_profiling_info))
+    print(table)
+
+
+def parse_gpu(args):
+    """Parse the gpu trace file when specified; otherwise return an empty ProfilingInfo."""
+    if args.gpu:
+        gpu_parser = GpuProfilingParser(args.gpu)
+        gpu_parser.parse_events()
+        return gpu_parser.profiling_info
+    print('Gpu trace json file is not specified.')
+    return ProfilingInfo()
+
+
+def parse_npu(npu_path):
+    """Parse the npu profiling files when all of them are available."""
+    if not npu_path.get('trace_view'):
+        print('Npu trace json file is not available.')
+        return ProfilingInfo()
+    if not npu_path.get('op_summary'):
+        print('Npu op summary csv file is not available.')
+        return ProfilingInfo()
+    if not npu_path.get('memory_record'):
+        print('Npu op memory csv file is not available.')
+        return ProfilingInfo()
+    npu_parser = NpuProfilingParser(npu_path)
+    npu_parser.parse_npu_json_events()
+    npu_parser.parse_npu_csv_events()
+    return npu_parser.profiling_info
+
+
+def main():
+    """Collect the npu profiling file paths and print the comparison table."""
+    args = parse_command()
+    npu_path = {'trace_view': None, 'memory_record': None, 'op_summary': None}
+    for root, _, files in os.walk(args.npu):
+        for file in files:
+            if file == 'trace_view.json':
+                npu_path['trace_view'] = os.path.join(root, file)
+            if file == 'memory_record.csv':
+                npu_path['memory_record'] = os.path.join(root, file)
+            if 'op_summary' in file:
+                npu_path['op_summary'] = os.path.join(root, file)
+    show_table(parse_gpu(args), parse_npu(npu_path))
+
+
+if __name__ == '__main__':
+    main()
--
Gitee