From 8451956c9108ed3b52bdbdebd0b214de301013df Mon Sep 17 00:00:00 2001
From: sunboquan
Date: Tue, 15 Aug 2023 23:19:36 +0800
Subject: [PATCH] compare entry

---
 profiler/compare_tools/performance_compare.py |  21 +++
 .../profiling_analysis/__init__.py            |  14 ++
 .../profiling_analysis/gpu_parser.py          | 110 ++++++++++
 .../profiling_analysis/npu_parser.py          | 168 ++++++++++++++++++
 .../profiling_analysis/parser_helper.py       |  39 ++++
 .../profiling_analysis/profiling_parse.py     |  88 +++++++++
 6 files changed, 440 insertions(+)
 create mode 100644 profiler/compare_tools/profiling_analysis/__init__.py
 create mode 100644 profiler/compare_tools/profiling_analysis/gpu_parser.py
 create mode 100644 profiler/compare_tools/profiling_analysis/npu_parser.py
 create mode 100644 profiler/compare_tools/profiling_analysis/parser_helper.py
 create mode 100644 profiler/compare_tools/profiling_analysis/profiling_parse.py

diff --git a/profiler/compare_tools/performance_compare.py b/profiler/compare_tools/performance_compare.py
index e2c47ef6c5..46f7160341 100644
--- a/profiler/compare_tools/performance_compare.py
+++ b/profiler/compare_tools/performance_compare.py
@@ -7,6 +7,24 @@ import time
 
 from generation.comparison_generator import ComparisonGenerator
 from utils.args_manager import ArgsManager
+from profiling_analysis.profiling_parse import prof_main
+from utils.constant import Constant
+
+
+def performance_compare(args):
+    if args.disable_profiling_compare:
+        return
+    npu_path = ''
+    gpu_path = ''
+    if ArgsManager().base_profiling_type == Constant.NPU:
+        npu_path = ArgsManager().base_profiling.file_path
+    elif ArgsManager().base_profiling_type == Constant.GPU:
+        gpu_path = ArgsManager().base_profiling.file_path
+    if ArgsManager().comparison_profiling_type == Constant.NPU:
+        npu_path = ArgsManager().comparison_profiling.file_path
+    elif ArgsManager().comparison_profiling_type == Constant.GPU:
+        gpu_path = ArgsManager().comparison_profiling.file_path
+    prof_main(npu_path, gpu_path)
 
 
 def main():
@@ -14,6 +32,8 @@ def main():
     parser = argparse.ArgumentParser(description="Compare trace of GPU and NPU")
     parser.add_argument("base_profiling_path", type=str, default='', help="base profiling file path")
     parser.add_argument("comparison_profiling_path", type=str, default='', help="comparison profiling file path")
+    parser.add_argument("--disable_profiling_compare", default=False, action='store_true',
+                        help="do not compare the GPU and NPU performance breakdown")
     parser.add_argument("--disable_operator_compare", default=False, action='store_true',
                         help="do not compare operator execution time")
     parser.add_argument("--disable_memory_compare", default=False, action='store_true',
@@ -29,6 +49,7 @@ def main():
     args = parser.parse_args()
 
     ArgsManager().init(args)
+    performance_compare(args)
     dir_path = args.output_path if args.output_path else "./"
     file_name = "performance_comparison_result_{}.xlsx".format(time.strftime("%Y%m%d%H%M%S", time.localtime(time.time())))
     result_file_path = os.path.join(dir_path, file_name)
diff --git a/profiler/compare_tools/profiling_analysis/__init__.py b/profiler/compare_tools/profiling_analysis/__init__.py
new file mode 100644
index 0000000000..8400fd5ecd
--- /dev/null
+++ b/profiler/compare_tools/profiling_analysis/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/compare_tools/profiling_analysis/gpu_parser.py b/profiler/compare_tools/profiling_analysis/gpu_parser.py
new file mode 100644
index 0000000000..61cd4f41bf
--- /dev/null
+++ b/profiler/compare_tools/profiling_analysis/gpu_parser.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import Counter, defaultdict
+import pandas as pd
+
+import profiling_analysis.parser_helper as parser_helper
+
+
+class GpuProfilingParser:
+    def __init__(self, gpu_path):
+        self.trace_events = self.read_profiling_json_file(gpu_path)
+        self.compute_stream_id = self.infer_compute_stream_id()
+        self.one_step_time = 0
+        self.profiling_info = parser_helper.ProfilingInfo()
+
+    @staticmethod
+    def read_profiling_json_file(json_path):
+        data = parser_helper.read_json_file(json_path)
+        if 'traceEvents' not in data:
+            raise RuntimeError("The gpu profiling json doesn't contain traceEvents data.")
+        return data.get('traceEvents')
+
+    def parse_events(self):
+        cube_time = 0.0
+        all_op_time = 0.0
+        op_list = []
+        compute_stream_dur = 0.0  # total duration of the compute stream
+        marks = defaultdict(int)  # per-timestep marks used to derive communication time not overlapped by compute
+
+        for event in self.trace_events:
+            if not isinstance(event, dict):
+                continue
+            if event.get('args') and event.get('args').get('stream') == self.compute_stream_id:
+                compute_stream_dur += float(event.get('dur'))
+            if not {'name', 'cat', 'dur', 'ts'} <= event.keys():
+                continue
+            name = event.get('name')
+            dur = event.get('dur')
+            ts = event.get('ts')
+            cat = event.get('cat')
+            if cat.lower() != 'kernel':
+                continue
+            if 'nccl' in name:
+                for timestep in range(ts + 1, ts + dur + 1):
+                    marks[str(timestep)] += 1  # mark this timestep in communication stream
+                continue
+            else:
+                for timestep in range(ts + 1, ts + dur + 1):
+                    marks[str(timestep)] += -100  # mark this timestep in compute stream
+            if 'gemm' in name:
+                cube_time += float(dur)
+            all_op_time += float(dur)
+            op_list.append([ts, name, cat, dur])
+        op_dataframe = pd.DataFrame(op_list, columns=['time start', 'name', 'cat', 'dur'])
+        op_dataframe.to_csv('gpu_perf.csv', index=False)
+        self.profiling_info.compute_time = compute_stream_dur / 10 ** 6
+        self.profiling_info.communication_not_overlapped = len([_ for _, value in marks.items() if value > 0]) / 10 ** 6
+        self.profiling_info.cube_time = cube_time / 10 ** 6
+        self.profiling_info.vector_time = (all_op_time - cube_time) / 10 ** 6
+        self.parse_e2e_time()
+        if self.one_step_time:
+            self.profiling_info.scheduling_time = self.one_step_time - all_op_time / 10 ** 6 - \
+                                                  self.profiling_info.communication_not_overlapped
+        else:
+            self.profiling_info.scheduling_time = self.profiling_info.e2e_time - all_op_time / 10 ** 6 - \
+                                                  self.profiling_info.communication_not_overlapped
+        self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time
+        self.parse_memory_reserved()
+
+    def parse_e2e_time(self):
+        compute_events_timeline = [event for event in self.trace_events if
+                                   event.get('args') and event.get('args').get('stream') == self.compute_stream_id]
+        compute_events_timeline = sorted(compute_events_timeline, key=lambda event: event.get('ts'))
+        self.profiling_info.e2e_time = (compute_events_timeline[-1].get('ts') + compute_events_timeline[-1].get('dur') -
+                                        compute_events_timeline[0].get('ts')) / 10 ** 6
+
+    def parse_memory_reserved(self):
+        memories = [
+            event.get('args').get('Total Reserved') for event in self.trace_events
+            if event.get('name') == '[memory]' and event.get('args').get('Device Id') >= 0
+        ]
+        if not memories:
+            print("Gpu profiling data doesn't contain memory info.")
+            return
+        self.profiling_info.memory_used = max(memories) / 1024 ** 3
+
+    def infer_compute_stream_id(self):
+        kernel_stream_ids = []
+        for event in self.trace_events:
+            is_kernel_exec_event = event.get('cat') == 'Kernel' and 'nccl' not in event.get('name', '')
+            has_stream_id_event = event.get('args') and event.get('args').get('stream')
+            if is_kernel_exec_event and has_stream_id_event:
+                kernel_stream_ids.append(event.get('args').get('stream'))
+        if not kernel_stream_ids:
+            raise RuntimeError('The profiling data does not contain kernel running data.')
+        counter = Counter(kernel_stream_ids)
+        return counter.most_common(1)[0][0]
diff --git a/profiler/compare_tools/profiling_analysis/npu_parser.py b/profiler/compare_tools/profiling_analysis/npu_parser.py
new file mode 100644
index 0000000000..bc4d21145f
--- /dev/null
+++ b/profiler/compare_tools/profiling_analysis/npu_parser.py
@@ -0,0 +1,168 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
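+
+"""Parse NPU profiling output (trace_view.json, op_summary_*.csv and
+memory_record.csv) into the ProfilingInfo fields shown by profiling_parse."""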
+
+import sys
+from collections import defaultdict
+import pandas as pd
+import profiling_analysis.parser_helper as parser_helper
+
+
+class NpuProfilingParser:
+    def __init__(self, npu_step_time, add_cube_name, npu_file_path):
+        self.npu_json_file = npu_file_path.get('trace_view')
+        self.npu_summary_file = npu_file_path.get('op_summary')
+        self.npu_mem_file = npu_file_path.get('memory_record')
+        self.profiling_info = parser_helper.ProfilingInfo()
+        self.npu_step_time = npu_step_time
+        self.parallel_time = 0
+        self.aicore_time = 0
+        self.cube_op_type = ['MatMul', 'BatchMatMul']
+        self.cube_op_type = list(set(self.cube_op_type + add_cube_name))
+        self.min_aicore_ts = sys.float_info.max
+        self.max_aicore_ts = sys.float_info.min
+
+    def parse_npu_json_events(self):
+        if not self.npu_json_file:
+            print('Npu trace json file is not available.')
+            return
+        compute_time = 0
+        min_ts = sys.float_info.max
+        max_ts = sys.float_info.min
+        ts_flag = False  # flags whether a compute_time event was found in the trace
+        data = parser_helper.read_json_file(self.npu_json_file)
+        event_wait_sqe = defaultdict(list)
+        ai_core_dict = defaultdict(list)
+        event_wait_sqe_res = defaultdict(float)
+        ai_core_res = defaultdict(float)
+        for dic in data:
+            self.get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, event_wait_sqe_res, ai_core_res)
+            if ('name' in dic) and (dic.get('name') == 'compute_time'):
+                ts_flag = True
+                ts = dic.get('ts')
+                dur = dic.get('dur')
+                compute_time += dur
+                min_ts = ts if ts < min_ts else min_ts
+                max_ts = (ts + dur) if (ts + dur) > max_ts else max_ts
+        # a stream holding both AI_CORE and EVENT_WAIT_SQE tasks is a compute stream
+        compute_stream = []
+        parallel_stream = []
+        # a single stream means operators do not run in parallel
+        if len(ai_core_dict) == 1:
+            compute_stream.append(min(ai_core_dict.keys()))
+        elif len(ai_core_dict) == 2:  # two AI Core streams: a parallel stream exists (at most two compute streams currently)
+            compute_stream = list(event_wait_sqe.keys() & ai_core_dict.keys())
+            parallel_stream = list(ai_core_dict.keys() - set(compute_stream))
+        else:
+            print('Npu trace json file lacks Stream info.')
+            return
+        cs_event_wait_sqe_list = event_wait_sqe[compute_stream[0]]
+        if parallel_stream:
+            cs_ai_core_list = ai_core_dict[parallel_stream[0]]
+            cs_event_wait_sqe_list.sort(key=lambda x: x[0])
+            cs_ai_core_list.sort(key=lambda x: x[0])
+            self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list)
+        self.profiling_info.compute_time = compute_time / 10 ** 6 if ts_flag else ai_core_res[compute_stream[0]] / 10 ** 6
+        self.profiling_info.e2e_time = (max_ts - min_ts) / 10 ** 6 if ts_flag else (self.max_aicore_ts - self.min_aicore_ts) / 10 ** 6
+        self.profiling_info.communication_not_overlapped = (event_wait_sqe_res[compute_stream[0]] -
+                                                            self.parallel_time) / 10 ** 6
+        time_required = (self.profiling_info.cube_time + self.profiling_info.vector_time) + \
+                        self.profiling_info.communication_not_overlapped
+        if self.npu_step_time:
+            self.profiling_info.scheduling_time = self.npu_step_time - time_required
+        else:
+            self.profiling_info.scheduling_time = self.profiling_info.e2e_time - time_required
+        self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time \
+            if self.profiling_info.e2e_time != 0 else 0
+
+    def parse_npu_csv_events(self):
+        if not self.npu_summary_file:
+            print('Npu op summary csv file is not available.')
+            return
+        info = pd.read_csv(self.npu_summary_file, index_col=None)
+        cube_time = 0.0
+        vec_time = 0.0
+        ai_core_time = 0.0
+        vec_mac_flag = True  # True if the summary file contains PMU info (aic_mac_time/aiv_vec_time columns)
+        if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None:
+            print('The profiling result may be in simplified mode; cube operators are identified by the whitelist below:')
+            print(self.cube_op_type)
+            vec_mac_flag = False
+        for i in range(len(info['Model ID'])):
+            task_type = info.loc[i, 'Task Type']
+            if task_type not in ['AI_CORE']:
+                continue
+            task_durations = info.loc[i, 'Task Duration(us)']
+            ai_core_time += task_durations
+            op_type = info.loc[i, 'OP Type']
+            if not vec_mac_flag:  # in simplified mode, count cube time by the OP Type whitelist and move on
+                cube_time += task_durations if op_type in self.cube_op_type else 0.0
+                continue
+            aiv_vec_time = info.loc[i, 'aiv_vec_time(us)']
+            if aiv_vec_time > 0:
+                vec_time += task_durations
+
+        if vec_mac_flag:
+            cube_time = (ai_core_time - vec_time) / 10 ** 6
+            vec_time /= 10 ** 6
+        else:
+            vec_time = (ai_core_time - cube_time) / 10 ** 6
+            cube_time /= 10 ** 6
+        self.profiling_info.cube_time = cube_time
+        self.profiling_info.vector_time = vec_time
+        if not self.npu_mem_file:
+            print('Npu op memory csv file is not available.')
+            return
+        try:
+            info = pd.read_csv(self.npu_mem_file, usecols=['Total Reserved(MB)'], index_col=None)
+        except ValueError:
+            print('Npu profiling data does not contain memory info.')
+        else:
+            self.profiling_info.memory_used = max(info.get('Total Reserved(MB)')) / 1024
+
+    @staticmethod
+    def interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list):
+        ans = 0
+        i = 0
+        j = 0
+        while i < len(cs_event_wait_sqe_list) and j < len(cs_ai_core_list):
+            lo = max(cs_event_wait_sqe_list[i][0], cs_ai_core_list[j][0])
+            hi = min(cs_event_wait_sqe_list[i][1], cs_ai_core_list[j][1])
+            if lo <= hi:
+                ans += (hi - lo)
+            if cs_event_wait_sqe_list[i][1] < cs_ai_core_list[j][1]:
+                i += 1
+            else:
+                j += 1
+        return ans
+
+    def get_ts_by_task_type(self, dic, event_wait_sqe, ai_core_dict, event_wait_res, ai_core_res):
+        if not dic.get('args'):
+            return
+        args = dic.get('args')
+        if args.get('Stream Id'):
+            stream_id = args.get('Stream Id')
+            ts = dic.get('ts')
+            dur = dic.get('dur')
+            if args.get('Task Type') == 'EVENT_WAIT_SQE':
+                event_wait_res[stream_id] += dur
+                event_wait_sqe[stream_id].append([ts, ts + dur])
+            elif args.get('Task Type') == 'AI_CORE':
+                self.min_aicore_ts = ts if ts < self.min_aicore_ts else self.min_aicore_ts
+                self.max_aicore_ts = (ts + dur) if (ts + dur) > self.max_aicore_ts else self.max_aicore_ts
+                ai_core_res[stream_id] += dur
+                ai_core_dict[stream_id].append([ts, ts + dur])
diff --git a/profiler/compare_tools/profiling_analysis/parser_helper.py b/profiler/compare_tools/profiling_analysis/parser_helper.py
new file mode 100644
index 0000000000..958a3146bb
--- /dev/null
+++ b/profiler/compare_tools/profiling_analysis/parser_helper.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
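+
+"""Shared helpers: the ProfilingInfo result container and a json file reader."""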
+
+import json
+import os
+
+
+class ProfilingInfo:
+    def __init__(self):
+        self.cube_time = 0.0
+        self.vector_time = 0.0
+        self.compute_time = 0.0
+        self.communication_not_overlapped = 0.0
+        self.scheduling_ratio = 0.0
+        self.memory_used = 0.0
+        self.e2e_time = 0.0
+        self.scheduling_time = 0.0
+
+
+def read_json_file(path):
+    if not os.path.isfile(path):
+        raise ValueError(f'The path "{path}" is not a valid json file.')
+    with open(path, 'r', encoding='utf-8') as json_handler:
+        data = json.load(json_handler)
+    return data
diff --git a/profiler/compare_tools/profiling_analysis/profiling_parse.py b/profiler/compare_tools/profiling_analysis/profiling_parse.py
new file mode 100644
index 0000000000..10e3a6d305
--- /dev/null
+++ b/profiler/compare_tools/profiling_analysis/profiling_parse.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+from prettytable import PrettyTable
+
+from profiling_analysis.gpu_parser import GpuProfilingParser
+from profiling_analysis.npu_parser import NpuProfilingParser
+from profiling_analysis.parser_helper import ProfilingInfo
+
+
+def parse_command():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-g', '--gpu', required=False, default='', metavar='(FILE)', help='Gpu profiling json file.')
+    parser.add_argument('-glt', '--gpu_log_time', required=False, default=0.0, type=float, help='Gpu one step time(s).')
+    parser.add_argument('-n', '--npu', required=False, default='', metavar='(FILE)',
+                        help='Npu single core profiling root path.')
+    parser.add_argument('-nlt', '--npu_log_time', required=False, default=0.0, type=float,
+                        help='Npu one step time(s).')
+    parser.add_argument('-aop', '--add_cube_op', required=False, default=[], nargs='*', help='add cube op name')
+    return parser.parse_args()
+
+
+def show_table(gpu_profiling_info, npu_profiling_info):
+    table = PrettyTable()
+    table.title = 'Large Model Performance Breakdown'
+    table.field_names = ['', 'Cube Time', 'Vector Time', 'Compute Stream Time', 'Communication',
+                         'Scheduling Time', 'Scheduling Ratio', 'Memory', 'E2E Time']
+    table.add_row(['GPU Baseline', f'{gpu_profiling_info.cube_time:.3f}s', f'{gpu_profiling_info.vector_time:.3f}s',
+                   f'{gpu_profiling_info.compute_time:.3f}s', f'{gpu_profiling_info.communication_not_overlapped:.3f}s',
+                   f'{gpu_profiling_info.scheduling_time:.3f}s', f'{gpu_profiling_info.scheduling_ratio:.2%}',
+                   f'{gpu_profiling_info.memory_used:.2f}G', f'{gpu_profiling_info.e2e_time:.3f}s'])
+    table.add_row(['NPU Current', f'{npu_profiling_info.cube_time:.3f}s', f'{npu_profiling_info.vector_time:.3f}s',
+                   f'{npu_profiling_info.compute_time:.3f}s', f'{npu_profiling_info.communication_not_overlapped:.3f}s',
+                   f'{npu_profiling_info.scheduling_time:.3f}s', f'{npu_profiling_info.scheduling_ratio:.2%}',
+                   f'{npu_profiling_info.memory_used:.2f}G', f'{npu_profiling_info.e2e_time:.3f}s'])
+    print(table)
+
+
+def parse_gpu(gpu_path):
+    if gpu_path:
+        gpu_parser = GpuProfilingParser(gpu_path)
+        gpu_parser.parse_events()
+        return gpu_parser.profiling_info
+    print('Gpu trace json file is not specified.')
+    return ProfilingInfo()
+
+
+def parse_npu(npu_path):
+    npu_parser = NpuProfilingParser(0, [], npu_path)
+    npu_parser.parse_npu_csv_events()
+    npu_parser.parse_npu_json_events()
+    return npu_parser.profiling_info
+
+
+def prof_main(npu_path, gpu_path):
+    if not npu_path or not gpu_path:
+        return
+
+    npu_dir = {'trace_view': None, 'memory_record': None, 'op_summary': None}
+    for root, _, files in os.walk(npu_path):
+        for file in files:
+            if file == 'trace_view.json':
+                npu_dir['trace_view'] = os.path.join(root, file)
+            if file == 'memory_record.csv':
+                npu_dir['memory_record'] = os.path.join(root, file)
+            if 'op_summary_' in file:
+                npu_dir['op_summary'] = os.path.join(root, file)
+    show_table(parse_gpu(gpu_path), parse_npu(npu_dir))
+
+
+if __name__ == '__main__':
+    main_args = parse_command()
+    prof_main(main_args.npu, main_args.gpu)
--
Gitee
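
Usage sketch for the new breakdown entry (paths below are hypothetical; prof_main
walks the NPU root for trace_view.json, memory_record.csv and op_summary_*.csv,
and reads the GPU chrome-trace json directly):

    # standalone breakdown table
    python profiling_parse.py -n ./npu_profiling_dir -g ./gpu_trace.json

    # breakdown as part of the full comparison (skip with --disable_profiling_compare)
    python performance_compare.py ./base_profiling_path ./comparison_profiling_path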