From 6f00bb7c6dbaaf25e192e941fdd7c2f5cf901e83 Mon Sep 17 00:00:00 2001 From: wangshouce Date: Wed, 8 Nov 2023 18:39:46 +0800 Subject: [PATCH] free time fanum --- .../profiling_analysis/gpu_parser.py | 26 ++++++++++++++----- .../profiling_analysis/npu_parser.py | 11 ++++++-- .../profiling_analysis/parser_helper.py | 2 ++ .../profiling_analysis/profiling_parse.py | 12 ++++----- 4 files changed, 37 insertions(+), 14 deletions(-) diff --git a/profiler/compare_tools/profiling_analysis/gpu_parser.py b/profiler/compare_tools/profiling_analysis/gpu_parser.py index a9c8466725..8f1b6d9c03 100644 --- a/profiler/compare_tools/profiling_analysis/gpu_parser.py +++ b/profiler/compare_tools/profiling_analysis/gpu_parser.py @@ -34,7 +34,9 @@ class OpTimeWarper: compute_stream_dur: float = 0.0, cube_num: int = 0, vec_num: int = 0, - sdma_num: int = 0 + sdma_num: int = 0, + fa_num_bwd: int = 0, + fa_num_fwd: int = 0 ): self.cube_time = cube_time self.sdma_time = sdma_time @@ -46,6 +48,8 @@ class OpTimeWarper: self.cube_num = cube_num self.vec_num = vec_num self.sdma_num = sdma_num + self.fa_num_bwd = fa_num_bwd + self.fa_num_fwd = fa_num_fwd class GpuProfilingParser: @@ -82,6 +86,8 @@ class GpuProfilingParser: cube_num = 0 vec_num = 0 sdma_num = 0 + fa_num_bwd = 0 + fa_num_fwd = 0 compute_stream_dur = 0.0 for event in self.trace_events: if not isinstance(event, dict): @@ -94,9 +100,11 @@ class GpuProfilingParser: dur = event.get('dur') ts = event.get('ts') cat = event.get('cat', '') - if self.is_sdma_time(name): - sdma_time += float(dur) - sdma_num += 1 + if event.get('args') and event.get('args').get('stream') == self.compute_stream_id: + if self.is_sdma_time(name): + sdma_time += float(dur) + sdma_num += 1 + continue if cat.lower() != 'kernel': continue if self.NCCL_MARK in name.lower(): @@ -109,8 +117,10 @@ class GpuProfilingParser: if self.is_flash_attention(name): if 'bwd' in name.lower(): fa_time_bwd += float(dur) + fa_num_bwd += 1 else: fa_time_fwd += float(dur) + fa_num_fwd += 1 elif self.CUBE_MARK in name.lower(): cube_num += 1 cube_time += float(dur) @@ -129,7 +139,9 @@ class GpuProfilingParser: compute_stream_dur=compute_stream_dur, cube_num=cube_num, vec_num=vec_num, - sdma_num=sdma_num + sdma_num=sdma_num, + fa_num_bwd=fa_num_bwd, + fa_num_fwd=fa_num_fwd ) return time_wrapper @@ -158,10 +170,12 @@ class GpuProfilingParser: self.profiling_info.cube_num = cube_num self.profiling_info.vec_num = vec_num self.profiling_info.sdma_num = sdma_num + self.profiling_info.fa_num_bwd = time_wrapper.fa_num_bwd + self.profiling_info.fa_num_fwd = time_wrapper.fa_num_fwd self.profiling_info.sdma_time = sdma_time / 10 ** 6 self.parse_e2e_time() - self.profiling_info.scheduling_time = self.profiling_info.e2e_time - all_op_time / 10 ** 6 - \ + self.profiling_info.scheduling_time = self.profiling_info.e2e_time - self.profiling_info.compute_time - \ self.profiling_info.communication_not_overlapped if self.profiling_info.e2e_time < Constant.EPS: self.profiling_info.scheduling_ratio = 0.0 diff --git a/profiler/compare_tools/profiling_analysis/npu_parser.py b/profiler/compare_tools/profiling_analysis/npu_parser.py index 8c9b3e75c0..89e903e7a8 100644 --- a/profiler/compare_tools/profiling_analysis/npu_parser.py +++ b/profiler/compare_tools/profiling_analysis/npu_parser.py @@ -152,9 +152,9 @@ class NpuProfilingParser: self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list) self.profiling_info.compute_time = compute_time / 10 ** 6 if is_cluster else \ ai_core_res[compute_stream[0]] / 10 ** 6 - self.profiling_info.other_time = self.profiling_info.compute_time - self.profiling_info.cube_time - \ + self.profiling_info.other_time = max(0, self.profiling_info.compute_time - self.profiling_info.cube_time - \ self.profiling_info.flash_attention_time_fwd - self.profiling_info.flash_attention_time_bwd - \ - self.profiling_info.vec_time + self.profiling_info.vec_time) self.profiling_info.e2e_time = ts_dur / 10 ** 6 if is_cluster else \ (self.max_stream_ts - self.min_stream_ts) / 10 ** 6 self.profiling_info.communication_not_overlapped = communication_time / 10 ** 6 \ @@ -195,6 +195,8 @@ class NpuProfilingParser: fa_time_bwd = 0.0 cube_num = 0 vec_num = 0 + fa_num_bwd = 0 + fa_num_fwd = 0 if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None: self.profiling_info.hide_op_details = True return @@ -207,8 +209,10 @@ class NpuProfilingParser: if self.FLASH_ATTENTION in op_type.lower(): if 'bwd' in op_type.lower() or 'grad' in op_type.lower(): fa_time_bwd += task_durations + fa_num_bwd += 1 else: fa_time_fwd += task_durations + fa_num_fwd += 1 elif aiv_vec_time > 0: vec_time += task_durations vec_num += 1 @@ -221,6 +225,9 @@ class NpuProfilingParser: self.profiling_info.flash_attention_time_fwd = fa_time_fwd / 10 ** 6 self.profiling_info.cube_num = cube_num self.profiling_info.vec_num = vec_num + self.profiling_info.fa_num_bwd = fa_num_bwd + self.profiling_info.fa_num_fwd = fa_num_fwd + def parse_mem_csv(self): if not self.npu_mem_file: diff --git a/profiler/compare_tools/profiling_analysis/parser_helper.py b/profiler/compare_tools/profiling_analysis/parser_helper.py index 3b42d9308c..377ce18a10 100644 --- a/profiler/compare_tools/profiling_analysis/parser_helper.py +++ b/profiler/compare_tools/profiling_analysis/parser_helper.py @@ -25,6 +25,8 @@ class ProfilingInfo: self.vec_time = 0.0 self.cube_num = 0 self.vec_num = 0 + self.fa_num_fwd = 0 + self.fa_num_bwd = 0 self.compute_time = 0.0 self.communication_not_overlapped = 0.0 self.scheduling_ratio = 0.0 diff --git a/profiler/compare_tools/profiling_analysis/profiling_parse.py b/profiler/compare_tools/profiling_analysis/profiling_parse.py index 434fba8309..adf182900f 100644 --- a/profiler/compare_tools/profiling_analysis/profiling_parse.py +++ b/profiler/compare_tools/profiling_analysis/profiling_parse.py @@ -40,13 +40,13 @@ def generate_table_info(base_profiling_info, comp_profiling_info, table): base_col.append(f'{base_profiling_info.other_time:.3f}s') comp_col.append(f'{comp_profiling_info.other_time:.3f}s') if base_profiling_info.flash_attention_time_fwd or comp_profiling_info.flash_attention_time_fwd: - headers.append('Flash Attention Time(Forward)') - base_col.append(f'{base_profiling_info.flash_attention_time_fwd:.3f}s') - comp_col.append(f'{comp_profiling_info.flash_attention_time_fwd:.3f}s') + headers.append('Flash Attention Time(Forward)(Num)') + base_col.append(f'{base_profiling_info.flash_attention_time_fwd:.3f}s({base_profiling_info.fa_num_fwd})') + comp_col.append(f'{comp_profiling_info.flash_attention_time_fwd:.3f}s({comp_profiling_info.fa_num_fwd})') if base_profiling_info.flash_attention_time_bwd or comp_profiling_info.flash_attention_time_bwd: - headers.append('Flash Attention Time(Backward)') - base_col.append(f'{base_profiling_info.flash_attention_time_bwd:.3f}s') - comp_col.append(f'{comp_profiling_info.flash_attention_time_bwd:.3f}s') + headers.append('Flash Attention Time(Backward)(Num)') + base_col.append(f'{base_profiling_info.flash_attention_time_bwd:.3f}s({base_profiling_info.fa_num_bwd})') + comp_col.append(f'{comp_profiling_info.flash_attention_time_bwd:.3f}s({comp_profiling_info.fa_num_bwd})') headers.extend(['Computing Time']) base_col.extend([f'{base_profiling_info.compute_time:.3f}s']) comp_col.extend([f'{comp_profiling_info.compute_time:.3f}s']) -- Gitee