diff --git a/profiler/performance_analyse/README.md b/profiler/performance_analyse/README.md deleted file mode 100644 index c769a0f25c27a92d5aac3c0d66608c3f66807907..0000000000000000000000000000000000000000 --- a/profiler/performance_analyse/README.md +++ /dev/null @@ -1,43 +0,0 @@ -# 性能分析工具 - -## 大模型性能拆解 -### GPU性能拆解 -#### 算子耗时 -包含cube算子耗时和vector算子耗时 -#### 计算流耗时: -gpu计算流所有event耗时总和 -#### 通信 -gpu通信未掩盖耗时 -#### 调度 -调度耗时 = 单步打屏时间 - 算子耗时 - 通信不可掩盖耗时,其中单步打屏时间需要用户输入,当用户不输入时,采用e2e耗时代替单步打屏时间 -获得调度耗时后,使用调度占比 = 调度耗时/E2E耗时 获取调度占比 -#### 内存分析 -gpu上的内存使用可以使用nvidia-smi查看 -profiling信息采集时打开profile_memory=True开关,即可从json文件中读出运行稳定后的memory信息 -#### 计算流e2e耗时 -gpu计算流端到端耗时 -### npu性能拆解 -#### 算子耗时 -包含cube算子耗时和vector算子耗时 -#### 计算流耗时: -npu计算流所有event耗时总和 -#### 通信 -npu通信未掩盖耗时 -#### 调度 -调度耗时 = 单步打屏时间 - 算子耗时 - 通信不可掩盖耗时,其中单步打屏时间需要用户输入,当用户不输入时,采用e2e耗时代替单步打屏时间 -获得调度耗时后,使用调度占比 = 调度耗时/E2E耗时 获取调度占比 -#### 内存分析 -npu上的内存使用可以使用npu-smi查看 -profiling信息采集时打开profile_memory=True开关,即可从csv文件中读出运行稳定后的memory信息 -#### 计算流e2e耗时 -gpu计算流端到端耗时 -### 使用方法 -- 获取数据:获取gpu和npu的profiling数据,若采集profiling数据时没开启memory采集开关,则没有内存使用数据 -- 运行命令:python profiling_parse.py -g gpu\gpu_trace_device0.json -glt 0.9 -n npu\xxx_ascend_pt -nlt 1.2 -aop op1 op2 -- 输出结果:可以得到gpu与npu对照的打屏性能拆解数据,其中-nlt为输入打屏时间,-aop为手动添加的cube算子类型 - -## 卡间不同步问题分析(实现中) -### GPU通信算子8卡同步情况可视化 -### NPU通信算子8卡同步情况可视化 - -## 更多分析功能规划中 \ No newline at end of file diff --git a/profiler/performance_analyse/__init__.py b/profiler/performance_analyse/__init__.py deleted file mode 100644 index 8400fd5ecd1246eaee795cebfccfacc80a94f08c..0000000000000000000000000000000000000000 --- a/profiler/performance_analyse/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/profiler/performance_analyse/gpu_parser.py b/profiler/performance_analyse/gpu_parser.py deleted file mode 100644 index 95391dc0ba9dd77020976a8843b1f343368820af..0000000000000000000000000000000000000000 --- a/profiler/performance_analyse/gpu_parser.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2023, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections import Counter, defaultdict -import pandas as pd - -import parser_helper - - -class GpuProfilingParser: - def __init__(self, args): - self.trace_events = self.read_profiling_json_file(args.gpu) - self.compute_stream_id = self.infer_compute_stream_id() - self.one_step_time = args.gpu_log_time - self.profiling_info = parser_helper.ProfilingInfo() - - @staticmethod - def read_profiling_json_file(json_path): - data = parser_helper.read_json_file(json_path) - if 'traceEvents' not in data: - raise RuntimeError("The gpu profiling json doesn't contain traceEvents data.") - return data.get('traceEvents') - - def parse_events(self): - cube_time = 0.0 - all_op_time = 0.0 - op_list = [] - compute_stream_dur = 0.0 # 计算流耗时 - marks = defaultdict(int) # mark for compute communication_not_overlapped time - - for event in self.trace_events: - if not isinstance(event, dict): - continue - if event.get('args') and event.get('args').get('stream') == self.compute_stream_id: - compute_stream_dur += float(event.get('dur')) - if not {'name', 'cat', 'dur', 'ts'} < event.keys(): - continue - name = event.get('name') - dur = event.get('dur') - ts = event.get('ts') - cat = event.get('cat') - if cat.lower() != 'kernel': - continue - if 'nccl' in name: - for timestep in range(ts + 1, ts + dur + 1): - marks[str(timestep)] += 1 # mark this timestep in communication stream - continue - else: - for timestep in range(ts + 1, ts + dur + 1): - marks[str(timestep)] += -100 # mark this timestep in compute stream - if 'gemm' in name: - cube_time += float(dur) - all_op_time += float(dur) - op_list.append([ts, name, cat, dur]) - op_dataframe = pd.DataFrame(op_list, columns=['time start', 'name', 'cat', 'dur']) - op_dataframe.to_csv('gpu_perf.csv', index=False) - self.profiling_info.compute_time = compute_stream_dur / 10 ** 6 - self.profiling_info.communication_not_overlapped = len([_ for _, value in marks.items() if value > 0]) / 10 ** 6 - self.profiling_info.cube_time = cube_time / 10 ** 6 - self.profiling_info.vector_time = (all_op_time - cube_time) / 10 ** 6 - self.parse_e2e_time() - if self.one_step_time: - self.profiling_info.scheduling_time = self.one_step_time - all_op_time / 10 ** 6 - \ - self.profiling_info.communication_not_overlapped - else: - self.profiling_info.scheduling_time = self.profiling_info.e2e_time - all_op_time / 10 ** 6 - \ - self.profiling_info.communication_not_overlapped - self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time - self.parse_memory_reserved() - - def parse_e2e_time(self): - compute_events_timeline = [event for event in self.trace_events if - event.get('args') and event.get('args').get('stream') == self.compute_stream_id] - compute_events_timeline = sorted(compute_events_timeline, key=lambda event: event.get('ts')) - self.profiling_info.e2e_time = (compute_events_timeline[-1].get('ts') + compute_events_timeline[-1].get('dur') - - compute_events_timeline[0].get('ts')) / 10 ** 6 - - def parse_memory_reserved(self): - memories = [ - event.get('args').get('Total Reserved') for event in self.trace_events - if event.get('name') == '[memory]' and event.get('args').get('Device Id') >= 0 - ] - if not memories: - print("Gpu profiling data doesn't contain memory info") - return - self.profiling_info.memory_used = max(memories) / 1024 ** 3 - - def infer_compute_stream_id(self): - kernel_stream_ids = [] - for event in self.trace_events: - is_kernel_exec_event = event.get('cat') == 'Kernel' and 'nccl' not in event.get('name') - has_stream_id_event = event.get('args') and event.get('args').get('stream') - if is_kernel_exec_event and has_stream_id_event: - kernel_stream_ids.append(event.get('args').get('stream')) - if not kernel_stream_ids: - raise RuntimeError('The profiling data does not contain kernel running data.') - counter = Counter(kernel_stream_ids) - return counter.most_common(1)[0][0] diff --git a/profiler/performance_analyse/npu_parser.py b/profiler/performance_analyse/npu_parser.py deleted file mode 100644 index 375dd85680ac07c6c537490395ec62ac4488aae0..0000000000000000000000000000000000000000 --- a/profiler/performance_analyse/npu_parser.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2023, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import pandas as pd -from collections import defaultdict -import parser_helper - - -class NpuProfilingParser: - def __init__(self, npu_step_time, add_cube_name, npu_file_path): - self.npu_json_file = npu_file_path.get('trace_view') - self.npu_summary_file = npu_file_path.get('op_summary') - self.npu_mem_file = npu_file_path.get('memory_record') - self.profiling_info = parser_helper.ProfilingInfo() - self.npu_step_time = npu_step_time - self.parallel_time = 0 - self.aicore_time = 0 - self.cube_op_type = ['MatMul', 'BatchMatMul'] - self.cube_op_type = list(set(self.cube_op_type + add_cube_name)) - self.min_aicore_ts = sys.float_info.max - self.max_aicore_ts = sys.float_info.min - - def parse_npu_json_events(self): - if not self.npu_json_file: - print('Npu trace json file is not available.') - return - compute_time = 0 - min_ts = sys.float_info.max - max_ts = sys.float_info.min - ts_flag = False # 表明没有获取到compute time的耗时 - data = parser_helper.read_json_file(self.npu_json_file) - event_wait_sqe = defaultdict(list) - ai_core_dict = defaultdict(list) - event_wait_sqe_res = defaultdict(float) - ai_core_res = defaultdict(float) - for dic in data: - self.get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, event_wait_sqe_res, ai_core_res) - if ('name' in dic) and (dic.get('name') == 'Compute'): - ts_flag = True - ts = dic.get('ts') - dur = dic.get('dur') - compute_time += dur - min_ts = ts if ts < min_ts else min_ts - max_ts = (ts + dur) if (ts + dur) > max_ts else max_ts - # AI_CORE和EVENT_WAIT_SQE共存为计算流 - compute_stream = [] - parallel_stream = [] - # 不存在算子并行的情况 - if len(ai_core_dict) == 1: - compute_stream.append(min(ai_core_dict.keys())) - elif len(ai_core_dict) == 2: # 2个ai_core,存在并行流(当前最多2条算子计算流) - compute_stream = list(event_wait_sqe.keys() & ai_core_dict.keys()) - parallel_stream = list(ai_core_dict.keys() - set(compute_stream)) - else: - print('Npu trace json file lack of Stream info') - return - cs_event_wait_sqe_list = event_wait_sqe[compute_stream[0]] - if parallel_stream: - cs_ai_core_list = ai_core_dict[parallel_stream[0]] - sorted(cs_event_wait_sqe_list, key=lambda x: (x[0])) - sorted(cs_ai_core_list, key=lambda x: (x[0])) - self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list) - self.profiling_info.compute_time = compute_time / 10 ** 6 if ts_flag else ai_core_res[compute_stream[0]] / 10 ** 6 - self.profiling_info.e2e_time = (max_ts - min_ts) / 10 ** 6 if ts_flag else (self.max_aicore_ts - self.min_aicore_ts) / 10 ** 6 - self.profiling_info.communication_not_overlapped = (event_wait_sqe_res[compute_stream[0]] - - self.parallel_time) / 10 ** 6 - time_required = (self.profiling_info.cube_time + self.profiling_info.vector_time) + \ - self.profiling_info.communication_not_overlapped - if self.npu_step_time: - self.profiling_info.scheduling_time = self.npu_step_time - time_required - else: - self.profiling_info.scheduling_time = self.profiling_info.e2e_time - time_required - self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time \ - if self.profiling_info.e2e_time != 0 else 0 - - def parse_npu_csv_events(self): - if not self.npu_summary_file: - print('Npu op summary csv file is not available.') - return - info = pd.read_csv(self.npu_summary_file, index_col=None) - cube_time = 0.0 - vec_time = 0.0 - ai_core_time = 0.0 - vec_mac_flag = True # True标记当前summary文件中存在pmu信息 - if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None: - print('当前的profiling结果可能是极简模式,通过cube算子白名单进行区分,白名单如下:') - print(self.cube_op_type) - vec_mac_flag = False - for i in range(len(info['Model ID'])): - task_type = info.loc[i, 'Task Type'] - if task_type not in ['AI_CORE']: - continue - task_durations = info.loc[i, 'Task Duration(us)'] - ai_core_time += task_durations - op_type = info.loc[i, 'OP Type'] - if not vec_mac_flag: # 如果是极简模式根据OP_Type计算完cube time后提前返回 - cube_time += task_durations if op_type in self.cube_op_type else 0.0 - continue - aiv_vec_time = info.loc[i, 'aiv_vec_time(us)'] - if aiv_vec_time > 0: - vec_time += task_durations - - if vec_mac_flag: - cube_time = (ai_core_time - vec_time) / 10 ** 6 - vec_time /= 10 ** 6 - else: - vec_time = (ai_core_time - cube_time) / 10 ** 6 - cube_time /= 10 ** 6 - self.profiling_info.cube_time = cube_time - self.profiling_info.vector_time = vec_time - if not self.npu_mem_file: - print('Npu op memory csv file is not available.') - return - try: - info = pd.read_csv(self.npu_mem_file, usecols=['Total Reserved(MB)'], index_col=None) - except ValueError: - print('Npu profiling data does not contain memory info.') - else: - self.profiling_info.memory_used = max(info.get('Total Reserved(MB)')) / 1024 - - @staticmethod - def interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list): - ans = 0 - i = 0 - j = 0 - while i < len(cs_event_wait_sqe_list) and j < len(cs_ai_core_list): - lo = max(cs_event_wait_sqe_list[i][0], cs_ai_core_list[j][0]) - hi = min(cs_event_wait_sqe_list[i][1], cs_ai_core_list[j][1]) - if lo <= hi: - ans += (hi - lo) - if cs_event_wait_sqe_list[i][1] < cs_ai_core_list[j][1]: - i += 1 - else: - j += 1 - return ans - - def get_ts_by_task_type(self, dic, event_wait_sqe, ai_core_dict, enent_wait_res, ai_core_res): - if not dic.get('args'): - return - args = dic.get('args') - if args.get('Stream Id'): - stream_id = args.get('Stream Id') - ts = dic.get('ts') - dur = dic.get('dur') - if args.get('Task Type') == 'EVENT_WAIT_SQE': - enent_wait_res[stream_id] += dur - event_wait_sqe[stream_id].append([ts, ts + dur]) - elif args.get('Task Type') == 'AI_CORE': - self.min_aicore_ts = ts if ts < self.min_aicore_ts else self.min_aicore_ts - self.max_aicore_ts = (ts + dur) if (ts + dur) > self.max_aicore_ts else self.max_aicore_ts - ai_core_res[stream_id] += dur - ai_core_dict[stream_id].append([ts, ts + dur]) diff --git a/profiler/performance_analyse/parser_helper.py b/profiler/performance_analyse/parser_helper.py deleted file mode 100644 index 958a3146bb58898cdb76003f5f59476a45c1593f..0000000000000000000000000000000000000000 --- a/profiler/performance_analyse/parser_helper.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2023, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os - - -class ProfilingInfo: - def __init__(self): - self.cube_time = 0.0 - self.vector_time = 0.0 - self.compute_time = 0.0 - self.communication_not_overlapped = 0.0 - self.scheduling_ratio = 0.0 - self.memory_used = 0.0 - self.e2e_time = 0.0 - self.scheduling_time = 0.0 - - -def read_json_file(path): - if not os.path.isfile(path): - raise ValueError(f'The path "{path}" is not a valid json file.') - with open(path, 'r', encoding='utf-8') as json_handler: - data = json.load(json_handler) - return data diff --git a/profiler/performance_analyse/profiling_parse.py b/profiler/performance_analyse/profiling_parse.py deleted file mode 100644 index c45c73e9e27b6b4b4c9aae222499404b3ad3eac4..0000000000000000000000000000000000000000 --- a/profiler/performance_analyse/profiling_parse.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2023, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -from prettytable import PrettyTable - -from gpu_parser import GpuProfilingParser -from npu_parser import NpuProfilingParser -from parser_helper import ProfilingInfo - - -def parse_command(): - parser = argparse.ArgumentParser() - parser.add_argument('-g', '--gpu', required=False, default='', metavar='(FILE)', help='Gpu profiling json file.') - parser.add_argument('-glt', '--gpu_log_time', required=False, default=0.0, type=float, help='Gpu one step time(s)') - parser.add_argument('-n', '--npu', required=False, default='', metavar='(FILE)', - help='Npu single core profiling root path.') - parser.add_argument('-nlt', '--npu_log_time', required=False, default=0.0, metavar='(FILE)', type=float, - help='Npu one step time(s).') - parser.add_argument('-aop', '--add_cube_op', required=False, default=[], nargs='*', help='add cube op name') - return parser.parse_args() - - -def show_table(gpu_profiling_info, npu_profiling_info): - table = PrettyTable() - table.title = '大模型性能拆解' - table.field_names = ['', 'cube算子', 'vector算子', '计算流耗时', '通信', '调度耗时', '调度占比', '内存', - 'E2E性能值'] - table.add_row(['GPU基线', f'{gpu_profiling_info.cube_time:.3f}s', f'{gpu_profiling_info.vector_time:.3f}s', - f'{gpu_profiling_info.compute_time:.3f}s', f'{gpu_profiling_info.communication_not_overlapped: .3f}s', - f'{gpu_profiling_info.scheduling_time:.3f}', f'{gpu_profiling_info.scheduling_ratio:.2%}', - f'{gpu_profiling_info.memory_used:.2f}G', f'{gpu_profiling_info.e2e_time:.3f}s']) - table.add_row(['NPU现状', f'{npu_profiling_info.cube_time:.3f}s', f'{npu_profiling_info.vector_time:.3f}s', - f'{npu_profiling_info.compute_time:.3f}s', f'{npu_profiling_info.communication_not_overlapped: .3f}s', - f'{npu_profiling_info.scheduling_time:.3f}', f'{npu_profiling_info.scheduling_ratio:.2%}', - f'{npu_profiling_info.memory_used:.2f}G', f'{npu_profiling_info.e2e_time:.3f}s']) - print(table) - - -def parse_gpu(args): - if args.gpu: - if args.gpu_log_time < 0: - raise ValueError("Gpu one step time shouldn't less than 0.") - gpu_parser = GpuProfilingParser(args) - gpu_parser.parse_events() - return gpu_parser.profiling_info - print('Gpu trace json file is not specified.') - return ProfilingInfo() - - -def parse_npu(args, npu_path): - npu_parser = NpuProfilingParser(args.npu_log_time, args.add_cube_op, npu_path) - npu_parser.parse_npu_csv_events() - npu_parser.parse_npu_json_events() - return npu_parser.profiling_info - - -def main(): - args = parse_command() - npu_path = {'trace_view': None, 'memory_record': None, 'op_summary': None} - for root, _, files in os.walk(args.npu): - for file in files: - if file == 'trace_view.json': - npu_path['trace_view'] = os.path.join(root, file) - if file == 'memory_record.csv': - npu_path['memory_record'] = os.path.join(root, file) - if 'op_summary_' in file: - npu_path['op_summary'] = os.path.join(root, file) - show_table(parse_gpu(args), parse_npu(args, npu_path)) - - -if __name__ == '__main__': - main()