From 8451956c9108ed3b52bdbdebd0b214de301013df Mon Sep 17 00:00:00 2001
From: sunboquan
Date: Tue, 15 Aug 2023 23:19:36 +0800
Subject: [PATCH] compare entry

---
 profiler/compare_tools/performance_compare.py |  21 +++
 .../profiling_analysis/__init__.py            |  14 ++
 .../profiling_analysis/gpu_parser.py          | 110 ++++++++++
 .../profiling_analysis/npu_parser.py          | 168 ++++++++++++++++++
 .../profiling_analysis/parser_helper.py       |  39 ++++
 .../profiling_analysis/profiling_parse.py     |  88 +++++++++
 6 files changed, 440 insertions(+)
 create mode 100644 profiler/compare_tools/profiling_analysis/__init__.py
 create mode 100644 profiler/compare_tools/profiling_analysis/gpu_parser.py
 create mode 100644 profiler/compare_tools/profiling_analysis/npu_parser.py
 create mode 100644 profiler/compare_tools/profiling_analysis/parser_helper.py
 create mode 100644 profiler/compare_tools/profiling_analysis/profiling_parse.py

diff --git a/profiler/compare_tools/performance_compare.py b/profiler/compare_tools/performance_compare.py
index e2c47ef6c5..46f7160341 100644
--- a/profiler/compare_tools/performance_compare.py
+++ b/profiler/compare_tools/performance_compare.py
@@ -7,6 +7,24 @@ import time
 
 from generation.comparison_generator import ComparisonGenerator
 from utils.args_manager import ArgsManager
+from profiling_analysis.profiling_parse import prof_main
+from utils.constant import Constant
+
+
+def performance_compare(args):
+    if args.disable_profiling_compare:
+        return
+    npu_path = ''
+    gpu_path = ''
+    if ArgsManager().base_profiling_type == Constant.NPU:
+        npu_path = ArgsManager().base_profiling.file_path
+    elif ArgsManager().base_profiling_type == Constant.GPU:
+        gpu_path = ArgsManager().base_profiling.file_path
+    if ArgsManager().comparison_profiling_type == Constant.NPU:
+        npu_path = ArgsManager().comparison_profiling.file_path
+    elif ArgsManager().comparison_profiling_type == Constant.GPU:
+        gpu_path = ArgsManager().comparison_profiling.file_path
+    prof_main(npu_path, gpu_path)
 
 
 def main():
@@ -14,6 +32,8 @@ def main():
     parser = argparse.ArgumentParser(description="Compare trace of GPU and NPU")
     parser.add_argument("base_profiling_path", type=str, default='', help="base profiling file path")
     parser.add_argument("comparison_profiling_path", type=str, default='', help="comparison profiling file path")
+    parser.add_argument("--disable_profiling_compare", default=False, action='store_true',
+                        help="do not compare the GPU and NPU performance breakdown")
     parser.add_argument("--disable_operator_compare", default=False, action='store_true',
                         help="do not compare operator execution time")
     parser.add_argument("--disable_memory_compare", default=False, action='store_true',
@@ -29,6 +49,7 @@ def main():
     args = parser.parse_args()
 
     ArgsManager().init(args)
+    performance_compare(args)
     dir_path = args.output_path if args.output_path else "./"
     file_name = "performance_comparison_result_{}.xlsx".format(time.strftime("%Y%m%d%H%M%S", time.localtime(time.time())))
     result_file_path = os.path.join(dir_path, file_name)
diff --git a/profiler/compare_tools/profiling_analysis/__init__.py b/profiler/compare_tools/profiling_analysis/__init__.py
new file mode 100644
index 0000000000..8400fd5ecd
--- /dev/null
+++ b/profiler/compare_tools/profiling_analysis/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/compare_tools/profiling_analysis/gpu_parser.py b/profiler/compare_tools/profiling_analysis/gpu_parser.py
new file mode 100644
index 0000000000..61cd4f41bf
--- /dev/null
+++ b/profiler/compare_tools/profiling_analysis/gpu_parser.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import Counter, defaultdict
+import pandas as pd
+
+import profiling_analysis.parser_helper as parser_helper
+
+
+class GpuProfilingParser:
+    def __init__(self, gpu_path):
+        self.trace_events = self.read_profiling_json_file(gpu_path)
+        self.compute_stream_id = self.infer_compute_stream_id()
+        self.one_step_time = 0
+        self.profiling_info = parser_helper.ProfilingInfo()
+
+    @staticmethod
+    def read_profiling_json_file(json_path):
+        data = parser_helper.read_json_file(json_path)
+        if 'traceEvents' not in data:
+            raise RuntimeError("The gpu profiling json doesn't contain traceEvents data.")
+        return data.get('traceEvents')
+
+    def parse_events(self):
+        cube_time = 0.0
+        all_op_time = 0.0
+        op_list = []
+        compute_stream_dur = 0.0  # total duration of the compute stream
+        marks = defaultdict(int)  # per-timestep marks used to derive communication time not overlapped by compute
+
+        for event in self.trace_events:
+            if not isinstance(event, dict):
+                continue
+            if event.get('args') and event.get('args').get('stream') == self.compute_stream_id:
+                compute_stream_dur += float(event.get('dur'))
+            if not {'name', 'cat', 'dur', 'ts'} <= event.keys():
+                continue
+            name = event.get('name')
+            dur = event.get('dur')
+            ts = event.get('ts')
+            cat = event.get('cat')
+            if cat.lower() != 'kernel':
+                continue
+            if 'nccl' in name:
+                for timestep in range(ts + 1, ts + dur + 1):
+                    marks[str(timestep)] += 1  # mark this timestep in communication stream
+                continue
+            else:
+                for timestep in range(ts + 1, ts + dur + 1):
+                    marks[str(timestep)] += -100  # mark this timestep in compute stream
+            if 'gemm' in name:
+                cube_time += float(dur)
+            all_op_time += float(dur)
+            op_list.append([ts, name, cat, dur])
+        op_dataframe = pd.DataFrame(op_list, columns=['time start', 'name', 'cat', 'dur'])
+        op_dataframe.to_csv('gpu_perf.csv', index=False)
+        self.profiling_info.compute_time = compute_stream_dur / 10 ** 6
+        self.profiling_info.communication_not_overlapped = len([_ for _, value in marks.items() if value > 0]) / 10 ** 6
+        self.profiling_info.cube_time = cube_time / 10 ** 6
+        self.profiling_info.vector_time = (all_op_time - cube_time) / 10 ** 6
+        self.parse_e2e_time()
+        if self.one_step_time:
+            self.profiling_info.scheduling_time = self.one_step_time - all_op_time / 10 ** 6 - \
+                                                  self.profiling_info.communication_not_overlapped
+        else:
+            self.profiling_info.scheduling_time = self.profiling_info.e2e_time - all_op_time / 10 ** 6 - \
+                                                  self.profiling_info.communication_not_overlapped
+        self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time
+        self.parse_memory_reserved()
+
+    def parse_e2e_time(self):
+        compute_events_timeline = [event for event in self.trace_events if
+                                   event.get('args') and event.get('args').get('stream') == self.compute_stream_id]
+        compute_events_timeline = sorted(compute_events_timeline, key=lambda event: event.get('ts'))
+        self.profiling_info.e2e_time = (compute_events_timeline[-1].get('ts') + compute_events_timeline[-1].get('dur') -
+                                        compute_events_timeline[0].get('ts')) / 10 ** 6
+
+    def parse_memory_reserved(self):
+        memories = [
+            event.get('args').get('Total Reserved') for event in self.trace_events
+            if event.get('name') == '[memory]' and event.get('args').get('Device Id') >= 0
+        ]
+        if not memories:
+            print("Gpu profiling data doesn't contain memory info.")
+            return
+        self.profiling_info.memory_used = max(memories) / 1024 ** 3
+
+    def infer_compute_stream_id(self):
+        kernel_stream_ids = []
+        for event in self.trace_events:
+            is_kernel_exec_event = event.get('cat') == 'Kernel' and 'nccl' not in event.get('name', '')
+            has_stream_id_event = event.get('args') and event.get('args').get('stream')
+            if is_kernel_exec_event and has_stream_id_event:
+                kernel_stream_ids.append(event.get('args').get('stream'))
+        if not kernel_stream_ids:
+            raise RuntimeError('The profiling data does not contain kernel running data.')
+        counter = Counter(kernel_stream_ids)
+        return counter.most_common(1)[0][0]
diff --git a/profiler/compare_tools/profiling_analysis/npu_parser.py b/profiler/compare_tools/profiling_analysis/npu_parser.py
new file mode 100644
index 0000000000..bc4d21145f
--- /dev/null
+++ b/profiler/compare_tools/profiling_analysis/npu_parser.py
@@ -0,0 +1,168 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
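+
+"""Parse NPU profiling output (trace_view.json, op_summary_*.csv and
+memory_record.csv) into the ProfilingInfo fields shown by profiling_parse."""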
+
+import sys
+from collections import defaultdict
+import pandas as pd
+import profiling_analysis.parser_helper as parser_helper
+
+
+class NpuProfilingParser:
+    def __init__(self, npu_step_time, add_cube_name, npu_file_path):
+        self.npu_json_file = npu_file_path.get('trace_view')
+        self.npu_summary_file = npu_file_path.get('op_summary')
+        self.npu_mem_file = npu_file_path.get('memory_record')
+        self.profiling_info = parser_helper.ProfilingInfo()
+        self.npu_step_time = npu_step_time
+        self.parallel_time = 0
+        self.aicore_time = 0
+        self.cube_op_type = ['MatMul', 'BatchMatMul']
+        self.cube_op_type = list(set(self.cube_op_type + add_cube_name))
+        self.min_aicore_ts = sys.float_info.max
+        self.max_aicore_ts = sys.float_info.min
+
+    def parse_npu_json_events(self):
+        if not self.npu_json_file:
+            print('Npu trace json file is not available.')
+            return
+        compute_time = 0
+        min_ts = sys.float_info.max
+        max_ts = sys.float_info.min
+        ts_flag = False  # flags whether a compute_time event was found in the trace
+        data = parser_helper.read_json_file(self.npu_json_file)
+        event_wait_sqe = defaultdict(list)
+        ai_core_dict = defaultdict(list)
+        event_wait_sqe_res = defaultdict(float)
+        ai_core_res = defaultdict(float)
+        for dic in data:
+            self.get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, event_wait_sqe_res, ai_core_res)
+            if ('name' in dic) and (dic.get('name') == 'compute_time'):
+                ts_flag = True
+                ts = dic.get('ts')
+                dur = dic.get('dur')
+                compute_time += dur
+                min_ts = ts if ts < min_ts else min_ts
+                max_ts = (ts + dur) if (ts + dur) > max_ts else max_ts
+        # a stream holding both AI_CORE and EVENT_WAIT_SQE tasks is a compute stream
+        compute_stream = []
+        parallel_stream = []
+        # a single stream means operators do not run in parallel
+        if len(ai_core_dict) == 1:
+            compute_stream.append(min(ai_core_dict.keys()))
+        elif len(ai_core_dict) == 2:  # two AI Core streams: a parallel stream exists (at most two compute streams currently)
+            compute_stream = list(event_wait_sqe.keys() & ai_core_dict.keys())
+            parallel_stream = list(ai_core_dict.keys() - set(compute_stream))
+        else:
+            print('Npu trace json file lacks Stream info.')
+            return
+        cs_event_wait_sqe_list = event_wait_sqe[compute_stream[0]]
+        if parallel_stream:
+            cs_ai_core_list = ai_core_dict[parallel_stream[0]]
+            cs_event_wait_sqe_list.sort(key=lambda x: x[0])
+            cs_ai_core_list.sort(key=lambda x: x[0])
+            self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list)
+        self.profiling_info.compute_time = compute_time / 10 ** 6 if ts_flag else ai_core_res[compute_stream[0]] / 10 ** 6
+        self.profiling_info.e2e_time = (max_ts - min_ts) / 10 ** 6 if ts_flag else (self.max_aicore_ts - self.min_aicore_ts) / 10 ** 6
+        self.profiling_info.communication_not_overlapped = (event_wait_sqe_res[compute_stream[0]] -
+                                                            self.parallel_time) / 10 ** 6
+        time_required = (self.profiling_info.cube_time + self.profiling_info.vector_time) + \
+                        self.profiling_info.communication_not_overlapped
+        if self.npu_step_time:
+            self.profiling_info.scheduling_time = self.npu_step_time - time_required
+        else:
+            self.profiling_info.scheduling_time = self.profiling_info.e2e_time - time_required
+        self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time \
+            if self.profiling_info.e2e_time != 0 else 0
+
+    def parse_npu_csv_events(self):
+        if not self.npu_summary_file:
+            print('Npu op summary csv file is not available.')
+            return
+        info = pd.read_csv(self.npu_summary_file, index_col=None)
+        cube_time = 0.0
+        vec_time = 0.0
+        ai_core_time = 0.0
+        vec_mac_flag = True  # True if the summary file contains PMU info (aic_mac_time/aiv_vec_time columns)
+        if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None:
+            print('The profiling result may be in simplified mode; cube operators are identified by the whitelist below:')
+            print(self.cube_op_type)
+            vec_mac_flag = False
+        for i in range(len(info['Model ID'])):
+            task_type = info.loc[i, 'Task Type']
+            if task_type not in ['AI_CORE']:
+                continue
+            task_durations = info.loc[i, 'Task Duration(us)']
+            ai_core_time += task_durations
+            op_type = info.loc[i, 'OP Type']
+            if not vec_mac_flag:  # in simplified mode, count cube time by the OP Type whitelist and move on
+                cube_time += task_durations if op_type in self.cube_op_type else 0.0
+                continue
+            aiv_vec_time = info.loc[i, 'aiv_vec_time(us)']
+            if aiv_vec_time > 0:
+                vec_time += task_durations
+
+        if vec_mac_flag:
+            cube_time = (ai_core_time - vec_time) / 10 ** 6
+            vec_time /= 10 ** 6
+        else:
+            vec_time = (ai_core_time - cube_time) / 10 ** 6
+            cube_time /= 10 ** 6
+        self.profiling_info.cube_time = cube_time
+        self.profiling_info.vector_time = vec_time
+        if not self.npu_mem_file:
+            print('Npu op memory csv file is not available.')
+            return
+        try:
+            info = pd.read_csv(self.npu_mem_file, usecols=['Total Reserved(MB)'], index_col=None)
+        except ValueError:
+            print('Npu profiling data does not contain memory info.')
+        else:
+            self.profiling_info.memory_used = max(info.get('Total Reserved(MB)')) / 1024
+
+    @staticmethod
+    def interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list):
+        ans = 0
+        i = 0
+        j = 0
+        while i < len(cs_event_wait_sqe_list) and j < len(cs_ai_core_list):
+            lo = max(cs_event_wait_sqe_list[i][0], cs_ai_core_list[j][0])
+            hi = min(cs_event_wait_sqe_list[i][1], cs_ai_core_list[j][1])
+            if lo <= hi:
+                ans += (hi - lo)
+            if cs_event_wait_sqe_list[i][1] < cs_ai_core_list[j][1]:
+                i += 1
+            else:
+                j += 1
+        return ans
+
+    def get_ts_by_task_type(self, dic, event_wait_sqe, ai_core_dict, event_wait_res, ai_core_res):
+        if not dic.get('args'):
+            return
+        args = dic.get('args')
+        if args.get('Stream Id'):
+            stream_id = args.get('Stream Id')
+            ts = dic.get('ts')
+            dur = dic.get('dur')
+            if args.get('Task Type') == 'EVENT_WAIT_SQE':
+                event_wait_res[stream_id] += dur
+                event_wait_sqe[stream_id].append([ts, ts + dur])
+            elif args.get('Task Type') == 'AI_CORE':
+                self.min_aicore_ts = ts if ts < self.min_aicore_ts else self.min_aicore_ts
+                self.max_aicore_ts = (ts + dur) if (ts + dur) > self.max_aicore_ts else self.max_aicore_ts
+                ai_core_res[stream_id] += dur
+                ai_core_dict[stream_id].append([ts, ts + dur])
diff --git a/profiler/compare_tools/profiling_analysis/parser_helper.py b/profiler/compare_tools/profiling_analysis/parser_helper.py
new file mode 100644
index 0000000000..958a3146bb
--- /dev/null
+++ b/profiler/compare_tools/profiling_analysis/parser_helper.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
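+
+"""Shared helpers: the ProfilingInfo result container and a json file reader."""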
+
+import json
+import os
+
+
+class ProfilingInfo:
+    def __init__(self):
+        self.cube_time = 0.0
+        self.vector_time = 0.0
+        self.compute_time = 0.0
+        self.communication_not_overlapped = 0.0
+        self.scheduling_ratio = 0.0
+        self.memory_used = 0.0
+        self.e2e_time = 0.0
+        self.scheduling_time = 0.0
+
+
+def read_json_file(path):
+    if not os.path.isfile(path):
+        raise ValueError(f'The path "{path}" is not a valid json file.')
+    with open(path, 'r', encoding='utf-8') as json_handler:
+        data = json.load(json_handler)
+    return data
diff --git a/profiler/compare_tools/profiling_analysis/profiling_parse.py b/profiler/compare_tools/profiling_analysis/profiling_parse.py
new file mode 100644
index 0000000000..10e3a6d305
--- /dev/null
+++ b/profiler/compare_tools/profiling_analysis/profiling_parse.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+from prettytable import PrettyTable
+
+from profiling_analysis.gpu_parser import GpuProfilingParser
+from profiling_analysis.npu_parser import NpuProfilingParser
+from profiling_analysis.parser_helper import ProfilingInfo
+
+
+def parse_command():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-g', '--gpu', required=False, default='', metavar='(FILE)', help='Gpu profiling json file.')
+    parser.add_argument('-glt', '--gpu_log_time', required=False, default=0.0, type=float, help='Gpu one step time(s).')
+    parser.add_argument('-n', '--npu', required=False, default='', metavar='(FILE)',
+                        help='Npu single core profiling root path.')
+    parser.add_argument('-nlt', '--npu_log_time', required=False, default=0.0, type=float,
+                        help='Npu one step time(s).')
+    parser.add_argument('-aop', '--add_cube_op', required=False, default=[], nargs='*', help='add cube op name')
+    return parser.parse_args()
+
+
+def show_table(gpu_profiling_info, npu_profiling_info):
+    table = PrettyTable()
+    table.title = 'Large Model Performance Breakdown'
+    table.field_names = ['', 'Cube Time', 'Vector Time', 'Compute Stream Time', 'Communication',
+                         'Scheduling Time', 'Scheduling Ratio', 'Memory', 'E2E Time']
+    table.add_row(['GPU Baseline', f'{gpu_profiling_info.cube_time:.3f}s', f'{gpu_profiling_info.vector_time:.3f}s',
+                   f'{gpu_profiling_info.compute_time:.3f}s', f'{gpu_profiling_info.communication_not_overlapped:.3f}s',
+                   f'{gpu_profiling_info.scheduling_time:.3f}s', f'{gpu_profiling_info.scheduling_ratio:.2%}',
+                   f'{gpu_profiling_info.memory_used:.2f}G', f'{gpu_profiling_info.e2e_time:.3f}s'])
+    table.add_row(['NPU Current', f'{npu_profiling_info.cube_time:.3f}s', f'{npu_profiling_info.vector_time:.3f}s',
+                   f'{npu_profiling_info.compute_time:.3f}s', f'{npu_profiling_info.communication_not_overlapped:.3f}s',
+                   f'{npu_profiling_info.scheduling_time:.3f}s', f'{npu_profiling_info.scheduling_ratio:.2%}',
+                   f'{npu_profiling_info.memory_used:.2f}G', f'{npu_profiling_info.e2e_time:.3f}s'])
+    print(table)
+
+
+def parse_gpu(gpu_path):
+    if gpu_path:
+        gpu_parser = GpuProfilingParser(gpu_path)
+        gpu_parser.parse_events()
+        return gpu_parser.profiling_info
+    print('Gpu trace json file is not specified.')
+    return ProfilingInfo()
+
+
+def parse_npu(npu_path):
+    npu_parser = NpuProfilingParser(0, [], npu_path)
+    npu_parser.parse_npu_csv_events()
+    npu_parser.parse_npu_json_events()
+    return npu_parser.profiling_info
+
+
+def prof_main(npu_path, gpu_path):
+    if not npu_path or not gpu_path:
+        return
+
+    npu_dir = {'trace_view': None, 'memory_record': None, 'op_summary': None}
+    for root, _, files in os.walk(npu_path):
+        for file in files:
+            if file == 'trace_view.json':
+                npu_dir['trace_view'] = os.path.join(root, file)
+            if file == 'memory_record.csv':
+                npu_dir['memory_record'] = os.path.join(root, file)
+            if 'op_summary_' in file:
+                npu_dir['op_summary'] = os.path.join(root, file)
+    show_table(parse_gpu(gpu_path), parse_npu(npu_dir))
+
+
+if __name__ == '__main__':
+    main_args = parse_command()
+    prof_main(main_args.npu, main_args.gpu)
--
Gitee
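
Usage sketch for the new breakdown entry (paths below are hypothetical; prof_main
walks the NPU root for trace_view.json, memory_record.csv and op_summary_*.csv,
and reads the GPU chrome-trace json directly):

    # standalone breakdown table
    python profiling_parse.py -n ./npu_profiling_dir -g ./gpu_trace.json

    # breakdown as part of the full comparison (skip with --disable_profiling_compare)
    python performance_compare.py ./base_profiling_path ./comparison_profiling_path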