diff --git a/debug/tools/profiling_analyse/README.md b/debug/tools/profiling_analyse/README.md
index d871f019827de5a1fa99368962d0938c54cd4a98..de39ab939eb075df115734cd9a0b34f7468de5ab 100644
--- a/debug/tools/profiling_analyse/README.md
+++ b/debug/tools/profiling_analyse/README.md
@@ -22,13 +22,16 @@ GPU memory usage can be viewed with nvidia-smi; analysis via the json file requires
 1. The operator timing profiling data is located in the op_summary_x_1.csv file under /PROFxxx/device_x/summary.
 2. Currently only the time of operators running on the vector and cube units is counted.
 3. Both kinds of operators appear in the csv file with TaskType AI_CORE; an operator whose aiv_vec_time dominates is a vector operator, while a dominant aic_mac_time indicates a cube operator. The durations of each kind are summed separately and output.
+4. If an operator has no pmu information, it is classified by cube operator type; the currently known cube operator types are ['MatMul', 'BatchMatMul'].
+5. Users who need additional cube operators can add operator names via the optional argument '-aop'; see the example for usage.
 
 ### Communication
-Communication here refers to the non-overlapped communication time, corresponding to EVENT_WAIT_SQE in ASCEND_PROFILER_OUTPUT/trace_view.json; when several Stream Ids are present, the smallest Stream Id is taken.
-The output is the sum of the durations of that field.
+1. Communication here refers to the non-overlapped communication time, i.e. the total EVENT_WAIT_SQE duration of a single stream in ASCEND_PROFILER_OUTPUT/trace_view.json.
+2. The compute stream is selected from trace_view, i.e. the stream that contains both EVENT_WAIT_SQE tasks and tasks whose Task Type is AI_CORE.
+3. When AI_CORE tasks appear on two streams, the AI_CORE time produced by the other stream within the same time intervals is subtracted while accumulating EVENT_WAIT_SQE on the compute stream.
 
 ### Compute stream e2e time
-This time is obtained from the minimum and maximum timestamps 'ts' in trace_view.json; the absolute value of their difference is the e2e time.
+This time is obtained from the minimum and maximum 'ts' timestamps of the compute_time events in trace_view.json; the absolute value of their difference is the e2e time.
 
 ### Scheduling ratio
 1. Obtaining the scheduling ratio first requires the scheduling time: scheduling ratio = scheduling time / e2e time * 100%.
@@ -42,5 +45,5 @@
 
 ## Example
 - step1: Collect the gpu and npu profiling data; if the memory switch was not enabled during collection, there is no memory usage data.
-- Run command: python profiling_parse.py -g gpu\gpu_trace_device0.json -glt 0.9 -n npu\xxx_ascend_pt -nlt 1.2
-- Output: a printed performance breakdown comparing gpu and npu.
+- Run command: python profiling_parse.py -g gpu\gpu_trace_device0.json -glt 0.9 -n npu\xxx_ascend_pt -nlt 1.2 -aop op1 op2
+- Output: a printed performance breakdown comparing gpu and npu, where -nlt is the npu step time (in seconds) supplied by the user and -aop lists manually added cube operator types.
diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py
index 611343027b89643910ec7a93188c20c8f051091a..bd8d8350f2768c783d0183476588d6a94e1eaa52 100644
--- a/debug/tools/profiling_analyse/npu_parser.py
+++ b/debug/tools/profiling_analyse/npu_parser.py
@@ -5,73 +5,101 @@ import parser_helper
 
 
 class NpuProfilingParser:
-    def __init__(self, npu_step_time, npu_file_path):
+    def __init__(self, npu_step_time, add_cube_name, npu_file_path):
         self.npu_json_file = npu_file_path.get('trace_view')
         self.npu_summary_file = npu_file_path.get('op_summary')
         self.npu_mem_file = npu_file_path.get('memory_record')
         self.profiling_info = parser_helper.ProfilingInfo()
         self.npu_step_time = npu_step_time
+        self.parallel_time = 0
+        self.aicore_time = 0
+        self.cube_op_type = ['MatMul', 'BatchMatMul']
+        self.cube_op_type = list(set(self.cube_op_type + add_cube_name))
 
     def parse_npu_json_events(self):
-        event_wait_sqe = defaultdict(float)
         if not self.npu_json_file:
             print('Npu trace json file is not available.')
             return
+        compute_time = 0
         min_ts = sys.float_info.max
         max_ts = sys.float_info.min
+        ts_flag = False  # False means no compute_time event has been found yet
         data = parser_helper.read_json_file(self.npu_json_file)
+        event_wait_sqe = defaultdict(list)
+        ai_core_dict = defaultdict(list)
+        event_wait_sqe_res = defaultdict(float)
         for dic in data:
-            if dic.get('name') == 'EVENT_WAIT_SQE':
-                args = dic.get('args')
-                stream_id = args.get('Stream Id')
-                event_wait_sqe[stream_id] += dic.get('dur')
-            if dic.get('ts'):
+            self.get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, event_wait_sqe_res)
+            if ('name' in dic) and (dic.get('name') == 'compute_time'):
+                ts_flag = True
                 ts = dic.get('ts')
+                dur = dic.get('dur')
+                compute_time += dur
                 min_ts = ts if ts < min_ts else min_ts
-                max_ts = ts if ts > max_ts else max_ts
-        self.profiling_info.e2e_time = (max_ts - min_ts) / 1000 / 1000
-        self.profiling_info.communication_not_overlapped = event_wait_sqe.get(min(event_wait_sqe)) / 1000 / 1000
+                max_ts = (ts + dur) if (ts + dur) > max_ts else max_ts
+        # a stream that contains both AI_CORE and EVENT_WAIT_SQE tasks is the compute stream
+        compute_stream = []
+        parallel_stream = []
+        # no parallel operator stream
+        if len(ai_core_dict) == 1:
+            compute_stream.append(min(ai_core_dict.keys()))
+        elif len(ai_core_dict) == 2:  # two ai_core streams: a parallel stream exists (at most 2 operator compute streams currently)
+            compute_stream = list(event_wait_sqe.keys() & ai_core_dict.keys())
+            parallel_stream = list(ai_core_dict.keys() - set(compute_stream))
+        cs_event_wait_sqe_list = event_wait_sqe[compute_stream[0]]
+        if parallel_stream:
+            cs_ai_core_list = ai_core_dict[parallel_stream[0]]
+            cs_event_wait_sqe_list.sort(key=lambda x: x[0])
+            cs_ai_core_list.sort(key=lambda x: x[0])
+            self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list)
+        self.profiling_info.compute_time = compute_time / 10 ** 6
+        self.profiling_info.e2e_time = (max_ts - min_ts) / 10 ** 6 if ts_flag else 0
+        self.profiling_info.communication_not_overlapped = (event_wait_sqe_res[compute_stream[0]] -
+                                                            self.parallel_time) / 10 ** 6
         time_required = (self.profiling_info.cube_time + self.profiling_info.vector_time) + \
             self.profiling_info.communication_not_overlapped
         if self.npu_step_time:
             self.profiling_info.scheduling_time = self.npu_step_time - time_required
         else:
             self.profiling_info.scheduling_time = self.profiling_info.e2e_time - time_required
-        self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time
+        self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time \
+            if self.profiling_info.e2e_time != 0 else 0
 
     def parse_npu_csv_events(self):
         if not self.npu_summary_file:
             print('Npu op summary csv file is not available.')
             return
         info = pd.read_csv(self.npu_summary_file, index_col=None)
-        op_statics_result = {}
         cube_time = 0.0
         vec_time = 0.0
-        length = len(info['Model ID'])
+        ai_core_time = 0.0
+        vec_mac_flag = True  # True means the summary file contains pmu info
         if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None:
-            raise ValueError('There is no cube time or vector time in the csv!The aic_mac_time(us) and '
-                             'aiv_vec_time(us) are necessary for the determination of cube and vector.')
-
-        for i in range(length):
-            op_type = info.loc[i, 'OP Type']
+            print('The profiling result may be in simplified mode; cube operators are identified by the whitelist below:')
+            print(self.cube_op_type)
+            vec_mac_flag = False
+        for i in range(len(info['Model ID'])):
             task_type = info.loc[i, 'Task Type']
+            if task_type not in ['AI_CORE']:
+                continue
             task_durations = info.loc[i, 'Task Duration(us)']
-            aic_mac_time = info.loc[i, 'aic_mac_time(us)']
+            ai_core_time += task_durations
+            op_type = info.loc[i, 'OP Type']
+            if not vec_mac_flag:  # simplified mode: classify by OP Type, then skip the pmu-based logic
+                cube_time += task_durations if op_type in self.cube_op_type else 0.0
+                continue
             aiv_vec_time = info.loc[i, 'aiv_vec_time(us)']
-
-            if task_type in ['AI_CORE']:
-                if aiv_vec_time > aic_mac_time:
-                    vec_time += task_durations
-                    if op_statics_result.get(op_type) is None:
-                        op_statics_result[op_type] = [task_durations, 'vector']
-                    else:
-                        op_statics_result[op_type][0] += task_durations
-                else:
-                    cube_time += task_durations
-                    if op_statics_result.get(op_type) is None:
-                        op_statics_result[op_type] = [task_durations, 'cube']
-                    else:
-                        op_statics_result[op_type][0] += task_durations
+            if aiv_vec_time > 0:
+                vec_time += task_durations
+
+        if vec_mac_flag:
+            cube_time = (ai_core_time - vec_time) / 10 ** 6
+            vec_time /= 10 ** 6
+        else:
+            vec_time = (ai_core_time - cube_time) / 10 ** 6
+            cube_time /= 10 ** 6
+        self.profiling_info.cube_time = cube_time
+        self.profiling_info.vector_time = vec_time
         if not self.npu_mem_file:
             print('Npu op memory csv file is not available.')
             return
@@ -81,5 +109,34 @@ class NpuProfilingParser:
             print('Npu profiling data does not contain memory info.')
         else:
             self.profiling_info.memory_used = max(info.get('Total Reserved(MB)')) / 1024
-        self.profiling_info.cube_time = cube_time / 10 ** 6
-        self.profiling_info.vector_time = vec_time / 10 ** 6
+
+    @staticmethod
+    def interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list):
+        ans = 0
+        i = 0
+        j = 0
+        while i < len(cs_event_wait_sqe_list) and j < len(cs_ai_core_list):
+            lo = max(cs_event_wait_sqe_list[i][0], cs_ai_core_list[j][0])
+            hi = min(cs_event_wait_sqe_list[i][1], cs_ai_core_list[j][1])
+            if lo <= hi:
+                ans += (hi - lo)
+            if cs_event_wait_sqe_list[i][1] < cs_ai_core_list[j][1]:
+                i += 1
+            else:
+                j += 1
+        return ans
+
+    @staticmethod
+    def get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, res):
+        if not dic.get('args'):
+            return
+        args = dic.get('args')
+        if args.get('Stream Id'):
+            stream_id = args.get('Stream Id')
+            ts = dic.get('ts')
+            dur = dic.get('dur')
+            if args.get('Task Type') == 'EVENT_WAIT_SQE':
+                res[stream_id] += dur
+                event_wait_sqe[stream_id].append([ts, ts + dur])
+            elif args.get('Task Type') == 'AI_CORE':
+                ai_core_dict[stream_id].append([ts, ts + dur])
diff --git a/debug/tools/profiling_analyse/parser_helper.py b/debug/tools/profiling_analyse/parser_helper.py
index 1477db6bcee6a3edf98a8fb8f7494f1f98556bb9..6b91fdd5342c91e8ea251e0232e08bfc44012593 100644
--- a/debug/tools/profiling_analyse/parser_helper.py
+++ b/debug/tools/profiling_analyse/parser_helper.py
@@ -6,7 +6,7 @@ class ProfilingInfo:
     def __init__(self):
         self.cube_time = 0.0
         self.vector_time = 0.0
-        self.large_kernel = 0.0
+        self.compute_time = 0.0
         self.communication_not_overlapped = 0.0
         self.scheduling_ratio = 0.0
         self.memory_used = 0.0
diff --git a/debug/tools/profiling_analyse/profiling_parse.py b/debug/tools/profiling_analyse/profiling_parse.py
index f3e891473728008dd0f070f080451b1c61ef8244..2f773495344e4f29a4d691d6818abf9b30e31a34 100644
--- a/debug/tools/profiling_analyse/profiling_parse.py
+++ b/debug/tools/profiling_analyse/profiling_parse.py
@@ -16,20 +16,21 @@
                         help='Npu single core profiling root path.')
     parser.add_argument('-nlt', '--npu_log_time', required=False, default=0.0, metavar='(FILE)', type=float,
                         help='Npu one step time(s).')
+    parser.add_argument('-aop', '--add_cube_op', required=False, default=[], nargs='*', help='add cube op name')
     return parser.parse_args()
 
 
 def show_table(gpu_profiling_info, npu_profiling_info):
     table = PrettyTable()
     table.title = 'Large model performance breakdown'
-    table.field_names = ['', 'cube ops', 'vector ops', 'large-kernel ops', 'communication', 'scheduling time', 'scheduling ratio', 'memory',
+    table.field_names = ['', 'cube ops', 'vector ops', 'compute stream time', 'communication', 'scheduling time', 'scheduling ratio', 'memory',
                          'E2E time']
     table.add_row(['GPU baseline', f'{gpu_profiling_info.cube_time:.3f}s', f'{gpu_profiling_info.vector_time:.3f}s',
-                   f'{gpu_profiling_info.large_kernel:.3f}s', f'{gpu_profiling_info.communication_not_overlapped: .3f}s',
+                   f'{gpu_profiling_info.compute_time:.3f}s', f'{gpu_profiling_info.communication_not_overlapped: .3f}s',
                    f'{gpu_profiling_info.scheduling_time:.3f}', f'{gpu_profiling_info.scheduling_ratio:.2%}',
                    f'{gpu_profiling_info.memory_used:.2f}G', f'{gpu_profiling_info.e2e_time:.3f}s'])
     table.add_row(['Current status', f'{npu_profiling_info.cube_time:.3f}s', f'{npu_profiling_info.vector_time:.3f}s',
-                   f'{npu_profiling_info.large_kernel:.3f}s', f'{npu_profiling_info.communication_not_overlapped: .3f}s',
+                   f'{npu_profiling_info.compute_time:.3f}s', f'{npu_profiling_info.communication_not_overlapped: .3f}s',
                    f'{npu_profiling_info.scheduling_time:.3f}', f'{npu_profiling_info.scheduling_ratio:.2%}',
                    f'{npu_profiling_info.memory_used:.2f}G', f'{npu_profiling_info.e2e_time:.3f}s'])
     print(table)
@@ -47,7 +48,7 @@ def parse_gpu(args):
 
 
 def parse_npu(args, npu_path):
-    npu_parser = NpuProfilingParser(args.npu_log_time, npu_path)
+    npu_parser = NpuProfilingParser(args.npu_log_time, args.add_cube_op, npu_path)
     npu_parser.parse_npu_csv_events()
     npu_parser.parse_npu_json_events()
     return npu_parser.profiling_info
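
As an aside on the non-overlapped communication change above: the snippet below is a minimal, standalone sketch of the two-pointer interval-intersection idea that `NpuProfilingParser.interval_intersection` uses to measure how much AI_CORE work on the parallel stream overlaps the compute stream's EVENT_WAIT_SQE intervals. The interval values here are made up purely for illustration and are not real profiling data.

```python
from typing import List


def interval_intersection(wait_intervals: List[List[float]], ai_core_intervals: List[List[float]]) -> float:
    """Total overlap between two lists of [start, end] intervals, each sorted by start time."""
    ans = 0.0
    i = j = 0
    while i < len(wait_intervals) and j < len(ai_core_intervals):
        lo = max(wait_intervals[i][0], ai_core_intervals[j][0])
        hi = min(wait_intervals[i][1], ai_core_intervals[j][1])
        if lo <= hi:
            ans += hi - lo
        # advance whichever interval ends first
        if wait_intervals[i][1] < ai_core_intervals[j][1]:
            i += 1
        else:
            j += 1
    return ans


if __name__ == '__main__':
    # made-up EVENT_WAIT_SQE intervals on the compute stream and AI_CORE intervals on the other stream (us)
    wait_sqe = [[0.0, 10.0], [20.0, 35.0]]
    ai_core = [[5.0, 12.0], [30.0, 40.0]]
    overlap = interval_intersection(wait_sqe, ai_core)
    print(f'parallel overlap: {overlap} us')  # 5 + 5 = 10 us
```

The overlap computed this way corresponds to `self.parallel_time`, which is subtracted from the compute stream's EVENT_WAIT_SQE total before the result is converted to seconds.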