From 7ddbf345105ca9044fa81b123ac00cd1090dc283 Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 16:58:50 +0800 Subject: [PATCH 1/7] =?UTF-8?q?=E5=AE=8C=E5=96=84=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E6=83=85=E5=86=B5=EF=BC=8C=E4=BF=AE=E6=AD=A3=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E8=AE=A1=E7=AE=97=E6=96=B9=E5=BC=8F=EF=BC=8C=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E5=B9=B6=E8=A1=8C=E8=AE=A1=E7=AE=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/tools/profiling_analyse/npu_parser.py | 110 +++++++++++++----- .../tools/profiling_analyse/parser_helper.py | 2 +- .../profiling_analyse/profiling_parse.py | 6 +- 3 files changed, 83 insertions(+), 35 deletions(-) diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py index 611343027..13ab4c9cc 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -11,26 +11,46 @@ class NpuProfilingParser: self.npu_mem_file = npu_file_path.get('memory_record') self.profiling_info = parser_helper.ProfilingInfo() self.npu_step_time = npu_step_time + self.parallel_time = 0 + self.aicore_time = 0 def parse_npu_json_events(self): - event_wait_sqe = defaultdict(float) if not self.npu_json_file: print('Npu trace json file is not available.') return + compute_time = 0 min_ts = sys.float_info.max max_ts = sys.float_info.min data = parser_helper.read_json_file(self.npu_json_file) + event_wait_sqe = defaultdict(list) + ai_core_dict = defaultdict(list) + event_wait_sqe_res = defaultdict(float) for dic in data: - if dic.get('name') == 'EVENT_WAIT_SQE': - args = dic.get('args') - stream_id = args.get('Stream Id') - event_wait_sqe[stream_id] += dic.get('dur') - if dic.get('ts'): + self.get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, event_wait_sqe_res) + if ('name' in dic) and (dic.get('name') == 'compute_time'): + compute_time += dic.get('dur') ts = dic.get('ts') min_ts = ts if ts < min_ts else min_ts 
max_ts = ts if ts > max_ts else max_ts + # AI_CORE和EVENT_WAIT_SQE共存为计算流 + compute_stream = [] + parallel_stream = [] + # 不存在算子并行的情况 + if len(ai_core_dict) == 1: + compute_stream.appen(min(ai_core_dict.keys())) + elif len(ai_core_dict) == 2: # 2个ai_core,存在并行流(当前最多2条算子计算流) + compute_stream = list(event_wait_sqe.keys() & ai_core_dict.keys()) + parallel_stream = list(ai_core_dict.keys() - set(compute_stream)) + cs_event_wait_sqe_list = event_wait_sqe[compute_stream[0]] + if parallel_stream: + cs_ai_core_list = ai_core_dict[parallel_stream[0]] + sorted(cs_event_wait_sqe_list, key=lambda x: (x[0])) + sorted(cs_ai_core_list, key=lambda x: (x[0])) + self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list) + self.profiling.compute_time = compute_time / 10 ** 6 self.profiling_info.e2e_time = (max_ts - min_ts) / 1000 / 1000 - self.profiling_info.communication_not_overlapped = event_wait_sqe.get(min(event_wait_sqe)) / 1000 / 1000 + self.profiling_info.communication_not_overlapped = (event_wait_sqe_res[compute_stream[0]] - + self.parallel_time) / 10 ** 6 time_required = (self.profiling_info.cube_time + self.profiling_info.vector_time) + \ self.profiling_info.communication_not_overlapped if self.npu_step_time: @@ -44,34 +64,32 @@ class NpuProfilingParser: print('Npu op summary csv file is not available.') return info = pd.read_csv(self.npu_summary_file, index_col=None) - op_statics_result = {} cube_time = 0.0 vec_time = 0.0 - length = len(info['Model ID']) + ai_core_time = 0.0 + vec_mac_flag = True # True标记当前summary文件中存在pmu信息 if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None: - raise ValueError('There is no cube time or vector time in the csv!The aic_mac_time(us) and ' - 'aiv_vec_time(us) are necessary for the determination of cube and vector.') - - for i in range(length): - op_type = info.loc[i, 'OP Type'] + print('当前的profiling结果可能是极简模式,无法区分cube和vector,总的ai_core耗时会展示在vector算子列') + vec_mac_flag = False + for i in 
range(len(info['Model ID'])): task_type = info.loc[i, 'Task Type'] + if task_type not in ['AI_CORE']: + continue task_durations = info.loc[i, 'Task Duration(us)'] - aic_mac_time = info.loc[i, 'aic_mac_time(us)'] + ai_core_time += task_durations + if not vec_mac_flag: + continue aiv_vec_time = info.loc[i, 'aiv_vec_time(us)'] - - if task_type in ['AI_CORE']: - if aiv_vec_time > aic_mac_time: - vec_time += task_durations - if op_statics_result.get(op_type) is None: - op_statics_result[op_type] = [task_durations, 'vector'] - else: - op_statics_result[op_type][0] += task_durations - else: - cube_time += task_durations - if op_statics_result.get(op_type) is None: - op_statics_result[op_type] = [task_durations, 'cube'] - else: - op_statics_result[op_type][0] += task_durations + if aiv_vec_time > 0: + vec_time += task_durations + + if vec_mac_flag: + cube_time = (ai_core_time - vec_time) / 10 ** 6 + vec_time /= 10 ** 6 + else: + vec_time = ai_core_time / 10 ** 6 + self.profiling_info.cube_time = cube_time + self.profiling_info.vector_time = vec_time if not self.npu_mem_file: print('Npu op memory csv file is not available.') return @@ -81,5 +99,35 @@ class NpuProfilingParser: print('Npu profiling data does not contain memory info.') else: self.profiling_info.memory_used = max(info.get('Total Reserved(MB)')) / 1024 - self.profiling_info.cube_time = cube_time / 10 ** 6 - self.profiling_info.vector_time = vec_time / 10 ** 6 + + @staticmethod + def interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list): + ans = 0 + i = 0 + j = 0 + while i < len(cs_event_wait_sqe_list) and j < len(cs_ai_core_list): + lo = max(cs_event_wait_sqe_list[i][0], cs_ai_core_list[j][0]) + hi = max(cs_event_wait_sqe_list[i][1], cs_ai_core_list[j][1]) + if lo < hi: + ans += (hi - lo) + if cs_event_wait_sqe_list[i][1] < cs_ai_core_list[j][1]: + i += 1 + else: + j += 1 + return ans + + + @staticmethod + def get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, res): + if not dic.get('args'): + 
return + args = dic.get('args') + if args.get('Stream Id'): + stream_id = args.get('Stream Id') + ts = dic.get('ts') + dur = dic.get('dur') + if args.get('Task Type') == 'EVENT_WAIT_SQE': + res[stream_id] += dur + event_wait_sqe[stream_id].append([ts, ts + dur]) + elif: + ai_core_dict[stream_id].append([ts, ts + dur]) \ No newline at end of file diff --git a/debug/tools/profiling_analyse/parser_helper.py b/debug/tools/profiling_analyse/parser_helper.py index 1477db6bc..6b91fdd53 100644 --- a/debug/tools/profiling_analyse/parser_helper.py +++ b/debug/tools/profiling_analyse/parser_helper.py @@ -6,7 +6,7 @@ class ProfilingInfo: def __init__(self): self.cube_time = 0.0 self.vector_time = 0.0 - self.large_kernel = 0.0 + self.compute_time = 0.0 self.communication_not_overlapped = 0.0 self.scheduling_ratio = 0.0 self.memory_used = 0.0 diff --git a/debug/tools/profiling_analyse/profiling_parse.py b/debug/tools/profiling_analyse/profiling_parse.py index f3e891473..b5dc9c385 100644 --- a/debug/tools/profiling_analyse/profiling_parse.py +++ b/debug/tools/profiling_analyse/profiling_parse.py @@ -22,14 +22,14 @@ def parse_command(): def show_table(gpu_profiling_info, npu_profiling_info): table = PrettyTable() table.title = '大模型性能拆解' - table.field_names = ['', 'cube算子', 'vector算子', '大kernel算子', '通信', '调度耗时', '调度占比', '内存', + table.field_names = ['', 'cube算子', 'vector算子', '计算流耗时', '通信', '调度耗时', '调度占比', '内存', 'E2E性能值'] table.add_row(['GPU基线', f'{gpu_profiling_info.cube_time:.3f}s', f'{gpu_profiling_info.vector_time:.3f}s', - f'{gpu_profiling_info.large_kernel:.3f}s', f'{gpu_profiling_info.communication_not_overlapped: .3f}s', + f'{gpu_profiling_info.compute_time:.3f}s', f'{gpu_profiling_info.communication_not_overlapped: .3f}s', f'{gpu_profiling_info.scheduling_time:.3f}', f'{gpu_profiling_info.scheduling_ratio:.2%}', f'{gpu_profiling_info.memory_used:.2f}G', f'{gpu_profiling_info.e2e_time:.3f}s']) table.add_row(['当前现状', f'{npu_profiling_info.cube_time:.3f}s', 
f'{npu_profiling_info.vector_time:.3f}s', - f'{npu_profiling_info.large_kernel:.3f}s', f'{npu_profiling_info.communication_not_overlapped: .3f}s', + f'{npu_profiling_info.compute_time:.3f}s', f'{npu_profiling_info.communication_not_overlapped: .3f}s', f'{npu_profiling_info.scheduling_time:.3f}', f'{npu_profiling_info.scheduling_ratio:.2%}', f'{npu_profiling_info.memory_used:.2f}G', f'{npu_profiling_info.e2e_time:.3f}s']) print(table) -- Gitee From c99f2e7f8129daea9a23396a79144d51697c5aee Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 17:10:15 +0800 Subject: [PATCH 2/7] =?UTF-8?q?=E6=9B=B4=E6=96=B0readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/tools/profiling_analyse/README.md | 8 +++++--- debug/tools/profiling_analyse/npu_parser.py | 5 +++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/debug/tools/profiling_analyse/README.md b/debug/tools/profiling_analyse/README.md index d871f0198..5a6fd5e15 100644 --- a/debug/tools/profiling_analyse/README.md +++ b/debug/tools/profiling_analyse/README.md @@ -22,13 +22,15 @@ gpu上的内存使用可以使用nvidia-smi查看,使用json文件分析时需 1、算子耗时profiling数据位于/PROFxxx/device_x/summary路径下的op_summary_x_1.csv文件中。 2、当前仅统计算子运行在vector和cube上的耗时。 3、这2中算子于csv文件中的的TaskType均为AI_CORE,其中aiv_vec_time时间多表明为vector算子,aic_mac_time表明为cube算子。分别累加求和算子耗时进行输出。 +4、算子若无pmu信息,仅统计ai_core的总耗时并显示在结果"vector算子"一列 ### 通信 -此处的通信为通信未掩盖耗时,对应为ASCEND_PROFILER_OUTPUT/trace_view.json下的EVENT_WAIT_SQE,对于多个Stream Id的结果,取Stream Id最小值。 -输出结果为该字段时间求和。 +1、此处的通信为通信未掩盖耗时,对应为ASCEND_PROFILER_OUTPUT/trace_view.json下同一条流的EVENT_WAIT_SQE总耗时。 +2、选取trace_view中的计算流——即流中同时存在EVENT_WAIT_SQE和Task Type为AI_CORE的流 +3、对于AI_CORE存在2条流中的情况,计算流中累加EVENT_WAIT_SQE时会减去同时间区间内另外流产生的AI_CORE耗时 ### 计算流e2e耗时 -此耗时通过统计trace_view.json中时间戳‘ts’的最小值和最大值,其时间差的绝对值即为e2e耗时。 +此耗时通过统计trace_view.json中compute_time时间戳‘ts’的最小值和最大值,其时间差的绝对值即为e2e耗时。 ### 调度占比 1、调度占比的求取需先计算调度耗时,调度占比=调度耗时/e2e耗时 * 100%。 diff --git a/debug/tools/profiling_analyse/npu_parser.py 
b/debug/tools/profiling_analyse/npu_parser.py index 13ab4c9cc..9bf051509 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -28,10 +28,11 @@ class NpuProfilingParser: for dic in data: self.get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, event_wait_sqe_res) if ('name' in dic) and (dic.get('name') == 'compute_time'): - compute_time += dic.get('dur') ts = dic.get('ts') + dur = dic.get('dur') + compute_time += dur min_ts = ts if ts < min_ts else min_ts - max_ts = ts if ts > max_ts else max_ts + max_ts = (ts + dur) if (ts + dur) > max_ts else max_ts # AI_CORE和EVENT_WAIT_SQE共存为计算流 compute_stream = [] parallel_stream = [] -- Gitee From ef94381cc56954821500cfd6682c4c0fe0190f04 Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 17:22:10 +0800 Subject: [PATCH 3/7] =?UTF-8?q?=E4=BF=AE=E6=AD=A3bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/tools/profiling_analyse/npu_parser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py index 9bf051509..9ccd73fff 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -38,7 +38,7 @@ class NpuProfilingParser: parallel_stream = [] # 不存在算子并行的情况 if len(ai_core_dict) == 1: - compute_stream.appen(min(ai_core_dict.keys())) + compute_stream.append(min(ai_core_dict.keys())) elif len(ai_core_dict) == 2: # 2个ai_core,存在并行流(当前最多2条算子计算流) compute_stream = list(event_wait_sqe.keys() & ai_core_dict.keys()) parallel_stream = list(ai_core_dict.keys() - set(compute_stream)) @@ -48,7 +48,7 @@ class NpuProfilingParser: sorted(cs_event_wait_sqe_list, key=lambda x: (x[0])) sorted(cs_ai_core_list, key=lambda x: (x[0])) self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list) - self.profiling.compute_time = compute_time / 10 ** 6 + 
self.profiling_info.compute_time = compute_time / 10 ** 6 self.profiling_info.e2e_time = (max_ts - min_ts) / 1000 / 1000 self.profiling_info.communication_not_overlapped = (event_wait_sqe_res[compute_stream[0]] - self.parallel_time) / 10 ** 6 @@ -108,8 +108,8 @@ class NpuProfilingParser: j = 0 while i < len(cs_event_wait_sqe_list) and j < len(cs_ai_core_list): lo = max(cs_event_wait_sqe_list[i][0], cs_ai_core_list[j][0]) - hi = max(cs_event_wait_sqe_list[i][1], cs_ai_core_list[j][1]) - if lo < hi: + hi = min(cs_event_wait_sqe_list[i][1], cs_ai_core_list[j][1]) + if lo <= hi: ans += (hi - lo) if cs_event_wait_sqe_list[i][1] < cs_ai_core_list[j][1]: i += 1 -- Gitee From 7eeba3f7985ed481b85709e95c09605f44d85a34 Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 17:28:26 +0800 Subject: [PATCH 4/7] bugfix --- debug/tools/profiling_analyse/npu_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py index 9ccd73fff..501bef19a 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -130,5 +130,5 @@ class NpuProfilingParser: if args.get('Task Type') == 'EVENT_WAIT_SQE': res[stream_id] += dur event_wait_sqe[stream_id].append([ts, ts + dur]) - elif: + elif args.get('Task Type') == 'AI_CORE': ai_core_dict[stream_id].append([ts, ts + dur]) \ No newline at end of file -- Gitee From b6e6c5693fd891aee47711d3c9f3af88f1703bb4 Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 20:07:21 +0800 Subject: [PATCH 5/7] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=97=A0pmu=E4=BF=A1?= =?UTF-8?q?=E6=81=AF=E6=97=B6cube=E7=AE=97=E5=AD=90=E7=99=BD=E5=90=8D?= =?UTF-8?q?=E5=8D=95=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/tools/profiling_analyse/README.md | 7 ++++--- debug/tools/profiling_analyse/npu_parser.py | 16 +++++++++++----- 
debug/tools/profiling_analyse/profiling_parse.py | 3 ++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/debug/tools/profiling_analyse/README.md b/debug/tools/profiling_analyse/README.md index 5a6fd5e15..de39ab939 100644 --- a/debug/tools/profiling_analyse/README.md +++ b/debug/tools/profiling_analyse/README.md @@ -22,7 +22,8 @@ gpu上的内存使用可以使用nvidia-smi查看,使用json文件分析时需 1、算子耗时profiling数据位于/PROFxxx/device_x/summary路径下的op_summary_x_1.csv文件中。 2、当前仅统计算子运行在vector和cube上的耗时。 3、这2中算子于csv文件中的的TaskType均为AI_CORE,其中aiv_vec_time时间多表明为vector算子,aic_mac_time表明为cube算子。分别累加求和算子耗时进行输出。 -4、算子若无pmu信息,仅统计ai_core的总耗时并显示在结果"vector算子"一列 +4、算子若无pmu信息,会根据cube算子类型进行区分,当前已知的算子类型为['MatMul', 'BatchMatMul'] +5、用户有添加cube算子的要求,可以使用可选入参'-aop'添加算子名称,使用方式见样例。 ### 通信 1、此处的通信为通信未掩盖耗时,对应为ASCEND_PROFILER_OUTPUT/trace_view.json下同一条流的EVENT_WAIT_SQE总耗时。 @@ -44,5 +45,5 @@ gpu上的内存使用可以使用nvidia-smi查看,使用json文件分析时需 ## 样例 - step1:获取gpu和npu的profiling数据,若采集profiling数据时没开启memory采集开关,则没有内存使用数据 -- 运行命令:python profiling_parse.py -g gpu\gpu_trace_device0.json -glt 0.9 -n npu\xxx_ascend_pt -nlt 1.2 -- 输出结果:可以得到gpu与npu对照的打屏性能拆解数据 +- 运行命令:python profiling_parse.py -g gpu\gpu_trace_device0.json -glt 0.9 -n npu\xxx_ascend_pt -nlt 1.2 -aop op1 op2 +- 输出结果:可以得到gpu与npu对照的打屏性能拆解数据,其中-nlt为输入打屏时间,-aop为手动添加的cube算子类型 diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py index 501bef19a..ede8e8763 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -5,7 +5,7 @@ import parser_helper class NpuProfilingParser: - def __init__(self, npu_step_time, npu_file_path): + def __init__(self, npu_step_time, add_cube_time, npu_file_path): self.npu_json_file = npu_file_path.get('trace_view') self.npu_summary_file = npu_file_path.get('op_summary') self.npu_mem_file = npu_file_path.get('memory_record') @@ -13,6 +13,8 @@ class NpuProfilingParser: self.npu_step_time = npu_step_time self.parallel_time = 0 self.aicore_time = 0 + 
self.cube_op_type = ['MatMul', 'BatchMatMul'] + self.cube_op_type = list(set(self.cube_op_type + add_cube_time)) def parse_npu_json_events(self): if not self.npu_json_file: @@ -70,7 +72,8 @@ class NpuProfilingParser: ai_core_time = 0.0 vec_mac_flag = True # True标记当前summary文件中存在pmu信息 if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None: - print('当前的profiling结果可能是极简模式,无法区分cube和vector,总的ai_core耗时会展示在vector算子列') + print('当前的profiling结果可能是极简模式,通过cube算子白名单进行区分,白名单如下:') + print(cube_op_type) vec_mac_flag = False for i in range(len(info['Model ID'])): task_type = info.loc[i, 'Task Type'] @@ -78,7 +81,9 @@ class NpuProfilingParser: continue task_durations = info.loc[i, 'Task Duration(us)'] ai_core_time += task_durations - if not vec_mac_flag: + op_type = info.loc[i, 'OP Type'] + if not vec_mac_flag: # 如果是极简模式根据OP_Type计算完cube time后提前返回 + cube_time += task_durations if op_type in self.cube_op_type else 0.0 continue aiv_vec_time = info.loc[i, 'aiv_vec_time(us)'] if aiv_vec_time > 0: @@ -88,7 +93,8 @@ class NpuProfilingParser: cube_time = (ai_core_time - vec_time) / 10 ** 6 vec_time /= 10 ** 6 else: - vec_time = ai_core_time / 10 ** 6 + vec_time = (ai_core_time - cube_time) / 10 ** 6 + cube_time /= 10 ** 6 self.profiling_info.cube_time = cube_time self.profiling_info.vector_time = vec_time if not self.npu_mem_file: @@ -131,4 +137,4 @@ class NpuProfilingParser: res[stream_id] += dur event_wait_sqe[stream_id].append([ts, ts + dur]) elif args.get('Task Type') == 'AI_CORE': - ai_core_dict[stream_id].append([ts, ts + dur]) \ No newline at end of file + ai_core_dict[stream_id].append([ts, ts + dur]) diff --git a/debug/tools/profiling_analyse/profiling_parse.py b/debug/tools/profiling_analyse/profiling_parse.py index b5dc9c385..2f7734953 100644 --- a/debug/tools/profiling_analyse/profiling_parse.py +++ b/debug/tools/profiling_analyse/profiling_parse.py @@ -16,6 +16,7 @@ def parse_command(): help='Npu single core profiling root path.') 
parser.add_argument('-nlt', '--npu_log_time', required=False, default=0.0, metavar='(FILE)', type=float, help='Npu one step time(s).') + parser.add_argument('-aop', '--add_cube_op', required=False, default=[], nargs='*', help='add cube op name') return parser.parse_args() @@ -47,7 +48,7 @@ def parse_gpu(args): def parse_npu(args, npu_path): - npu_parser = NpuProfilingParser(args.npu_log_time, npu_path) + npu_parser = NpuProfilingParser(args.npu_log_time, args.add_cube_op, npu_path) npu_parser.parse_npu_csv_events() npu_parser.parse_npu_json_events() return npu_parser.profiling_info -- Gitee From 95b2e5e76e7d36d6b4ae5f3e22e39f94bad741d3 Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 20:12:14 +0800 Subject: [PATCH 6/7] bugfix --- debug/tools/profiling_analyse/npu_parser.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py index ede8e8763..54b2b506d 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -5,7 +5,7 @@ import parser_helper class NpuProfilingParser: - def __init__(self, npu_step_time, add_cube_time, npu_file_path): + def __init__(self, npu_step_time, add_cube_name, npu_file_path): self.npu_json_file = npu_file_path.get('trace_view') self.npu_summary_file = npu_file_path.get('op_summary') self.npu_mem_file = npu_file_path.get('memory_record') @@ -14,7 +14,7 @@ class NpuProfilingParser: self.parallel_time = 0 self.aicore_time = 0 self.cube_op_type = ['MatMul', 'BatchMatMul'] - self.cube_op_type = list(set(self.cube_op_type + add_cube_time)) + self.cube_op_type = list(set(self.cube_op_type + add_cube_name)) def parse_npu_json_events(self): if not self.npu_json_file: @@ -73,7 +73,7 @@ class NpuProfilingParser: vec_mac_flag = True # True标记当前summary文件中存在pmu信息 if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None: 
print('当前的profiling结果可能是极简模式,通过cube算子白名单进行区分,白名单如下:') - print(cube_op_type) + print(self.cube_op_type) vec_mac_flag = False for i in range(len(info['Model ID'])): task_type = info.loc[i, 'Task Type'] @@ -123,7 +123,6 @@ class NpuProfilingParser: j += 1 return ans - @staticmethod def get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, res): if not dic.get('args'): -- Gitee From 2e0e76969966cf5be698532ebcc943644d5c8a99 Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 21:03:39 +0800 Subject: [PATCH 7/7] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E4=BF=9D=E6=8A=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/tools/profiling_analyse/npu_parser.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py index 54b2b506d..bd8d8350f 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -23,6 +23,7 @@ class NpuProfilingParser: compute_time = 0 min_ts = sys.float_info.max max_ts = sys.float_info.min + ts_flag = False # 表明没有获取到compute time的耗时 data = parser_helper.read_json_file(self.npu_json_file) event_wait_sqe = defaultdict(list) ai_core_dict = defaultdict(list) @@ -30,6 +31,7 @@ class NpuProfilingParser: for dic in data: self.get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, event_wait_sqe_res) if ('name' in dic) and (dic.get('name') == 'compute_time'): + ts_flag = True ts = dic.get('ts') dur = dic.get('dur') compute_time += dur @@ -51,7 +53,7 @@ class NpuProfilingParser: sorted(cs_event_wait_sqe_list, key=lambda x: (x[0])) sorted(cs_ai_core_list, key=lambda x: (x[0])) self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list) self.profiling_info.compute_time = compute_time / 10 ** 6 - self.profiling_info.e2e_time = (max_ts - min_ts) / 1000 / 1000 + self.profiling_info.e2e_time = (max_ts - min_ts) / 10 ** 6 if ts_flag else 0
self.profiling_info.communication_not_overlapped = (event_wait_sqe_res[compute_stream[0]] - self.parallel_time) / 10 ** 6 time_required = (self.profiling_info.cube_time + self.profiling_info.vector_time) + \ @@ -60,7 +62,8 @@ class NpuProfilingParser: self.profiling_info.scheduling_time = self.npu_step_time - time_required else: self.profiling_info.scheduling_time = self.profiling_info.e2e_time - time_required - self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time + self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time \ + if self.profiling_info.e2e_time != 0 else 0 def parse_npu_csv_events(self): if not self.npu_summary_file: -- Gitee