diff --git a/profiler/compare_tools/profiling_analysis/gpu_parser.py b/profiler/compare_tools/profiling_analysis/gpu_parser.py
index 449bc7514fb7d15571b44a03ff1f49b535dd3aef..fee461e200ca517d09ba0384ec13be20350a624d 100644
--- a/profiler/compare_tools/profiling_analysis/gpu_parser.py
+++ b/profiler/compare_tools/profiling_analysis/gpu_parser.py
@@ -40,7 +40,8 @@ class GpuProfilingParser:
     def parse_events(self):
         cube_time = 0.0
         all_op_time = 0.0
-        fa_time = 0.0
+        fa_time_bwd = 0.0
+        fa_time_fwd = 0.0
         cube_num = 0
         vec_num = 0
         op_list = []
@@ -68,7 +69,10 @@ class GpuProfilingParser:
             for timestep in range(ts + 1, ts + dur + 1):
                 marks[str(timestep)] += -100  # mark this timestep in compute stream
             if self.is_flash_attention(name):
-                fa_time += float(dur)
+                if 'bwd' in name.lower():
+                    fa_time_bwd += float(dur)
+                else:
+                    fa_time_fwd += float(dur)
             elif self.CUBE_MARK in name.lower():
                 cube_num += 1
                 cube_time += float(dur)
@@ -80,9 +84,10 @@ class GpuProfilingParser:
         op_dataframe.to_csv('gpu_perf.csv', index=False)
         self.profiling_info.compute_time = len([_ for _, value in marks.items() if value < 0]) / 10 ** 6
         self.profiling_info.communication_not_overlapped = len([_ for _, value in marks.items() if value > 0]) / 10 ** 6
-        self.profiling_info.flash_attention_time = fa_time / 10 ** 6
+        self.profiling_info.flash_attention_time_bwd = fa_time_bwd / 10 ** 6
+        self.profiling_info.flash_attention_time_fwd = fa_time_fwd / 10 ** 6
         self.profiling_info.cube_time = cube_time / 10 ** 6
-        self.profiling_info.vec_time = (all_op_time - cube_time - fa_time) / 10 ** 6
+        self.profiling_info.vec_time = (all_op_time - cube_time - fa_time_bwd - fa_time_fwd) / 10 ** 6
         self.profiling_info.cube_num = cube_num
         self.profiling_info.vec_num = vec_num
         self.parse_e2e_time()
diff --git a/profiler/compare_tools/profiling_analysis/npu_parser.py b/profiler/compare_tools/profiling_analysis/npu_parser.py
index 34a191e44e5acfcd151d7d3c9d9659d3d4718f85..f5f5e60285215abbd43cd4603f196fcea0a0daca 100644
--- a/profiler/compare_tools/profiling_analysis/npu_parser.py
+++ b/profiler/compare_tools/profiling_analysis/npu_parser.py
@@ -119,7 +119,8 @@ class NpuProfilingParser:
         info = pd.read_csv(self.npu_summary_file, index_col=None)
         cube_time = 0.0
         vec_time = 0.0
-        fa_time = 0.0
+        fa_time_fwd = 0.0
+        fa_time_bwd = 0.0
         cube_num = 0
         vec_num = 0
         if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None:
@@ -132,7 +133,10 @@ class NpuProfilingParser:
                 continue
             task_durations = info.loc[i, 'Duration(us)']
             if self.FLASH_ATTENTION in op_type.lower():
-                fa_time += task_durations
+                if 'bwd' in op_type.lower() or 'grad' in op_type.lower():
+                    fa_time_bwd += task_durations
+                else:
+                    fa_time_fwd += task_durations
             elif aiv_vec_time > 0:
                 vec_time += task_durations
                 vec_num += 1
@@ -141,7 +145,8 @@ class NpuProfilingParser:
                 cube_num += 1
         self.profiling_info.cube_time = cube_time / 10 ** 6
         self.profiling_info.vec_time = vec_time / 10 ** 6
-        self.profiling_info.flash_attention_time = fa_time / 10 ** 6
+        self.profiling_info.flash_attention_time_bwd = fa_time_bwd / 10 ** 6
+        self.profiling_info.flash_attention_time_fwd = fa_time_fwd / 10 ** 6
         self.profiling_info.cube_num = cube_num
         self.profiling_info.vec_num = vec_num

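The backward/forward split in npu_parser.py must apply `in` to each substring separately: a condition written as `'bwd' or 'grad' in op_type.lower()` would parse as `('bwd') or ('grad' in op_type.lower())`, and since the non-empty literal `'bwd'` is always truthy, every flash-attention kernel would be counted as backward. A minimal, self-contained sketch of the classification both parsers rely on; the op names and durations below are illustrative, not taken from a real trace:

```python
def classify_fa_direction(op_name: str) -> str:
    """Classify a flash-attention kernel as forward or backward by substring.

    Each substring is tested against the lowered name explicitly;
    `if 'bwd' or 'grad' in lowered` would always take the bwd branch.
    """
    lowered = op_name.lower()
    return 'bwd' if ('bwd' in lowered or 'grad' in lowered) else 'fwd'


# Accumulate per-direction time (microseconds) over hypothetical example ops.
fa_time = {'fwd': 0.0, 'bwd': 0.0}
for name, dur in [('FlashAttentionScore', 120.0), ('FlashAttentionScoreGrad', 310.0)]:
    fa_time[classify_fa_direction(name)] += dur

assert fa_time == {'fwd': 120.0, 'bwd': 310.0}
```

The GPU path keys only on `'bwd'` in the kernel name, while the NPU path also accepts `'grad'`, reflecting the different naming conventions the two traces are assumed to use.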
diff --git a/profiler/compare_tools/profiling_analysis/parser_helper.py b/profiler/compare_tools/profiling_analysis/parser_helper.py
index 5fe5f1770c7a97ce27d5c488e6b3b84cab93b289..f46095ed0cada49034cea0469aa837a5d3e1c049 100644
--- a/profiler/compare_tools/profiling_analysis/parser_helper.py
+++ b/profiler/compare_tools/profiling_analysis/parser_helper.py
@@ -30,6 +30,7 @@ class ProfilingInfo:
         self.memory_used = 0.0
         self.e2e_time = 0.0
         self.scheduling_time = 0.0
-        self.flash_attention_time = 0.0
+        self.flash_attention_time_bwd = 0.0
+        self.flash_attention_time_fwd = 0.0
         self.minimal_profiling = False
         self.hide_op_details = False
diff --git a/profiler/compare_tools/profiling_analysis/profiling_parse.py b/profiler/compare_tools/profiling_analysis/profiling_parse.py
index 17b1cb30681a1c7e9b422169b127ce88ac8cd693..70b435f7a6d79e6845a3cc4c660d881364979b40 100644
--- a/profiler/compare_tools/profiling_analysis/profiling_parse.py
+++ b/profiler/compare_tools/profiling_analysis/profiling_parse.py
@@ -35,10 +35,14 @@ def generate_table_info(base_profiling_info, comp_profiling_info, table):
                      f'{base_profiling_info.vec_time:.3f}s({base_profiling_info.vec_num})'])
     comp_col.extend([f'{comp_profiling_info.cube_time:.3f}s({comp_profiling_info.cube_num})',
                      f'{comp_profiling_info.vec_time:.3f}s({comp_profiling_info.vec_num})'])
-    if base_profiling_info.flash_attention_time or comp_profiling_info.flash_attention_time:
-        headers.append('Flash Attention Time')
-        base_col.append(f'{base_profiling_info.flash_attention_time:.3f}s')
-        comp_col.append(f'{comp_profiling_info.flash_attention_time:.3f}s')
+    if base_profiling_info.flash_attention_time_fwd or comp_profiling_info.flash_attention_time_fwd:
+        headers.append('Flash Attention Time(Forward)')
+        base_col.append(f'{base_profiling_info.flash_attention_time_fwd:.3f}s')
+        comp_col.append(f'{comp_profiling_info.flash_attention_time_fwd:.3f}s')
+    if base_profiling_info.flash_attention_time_bwd or comp_profiling_info.flash_attention_time_bwd:
+        headers.append('Flash Attention Time(Backward)')
+        base_col.append(f'{base_profiling_info.flash_attention_time_bwd:.3f}s')
+        comp_col.append(f'{comp_profiling_info.flash_attention_time_bwd:.3f}s')
     headers.extend(['Computing Time'])
     base_col.extend([f'{base_profiling_info.compute_time:.3f}s'])
     comp_col.extend([f'{comp_profiling_info.compute_time:.3f}s'])
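Both parsers fill the same pair of ProfilingInfo fields, and generate_table_info appends one table row per phase only when either run recorded time for that phase, so a forward-only profile still renders without an empty Backward row. A hedged sketch of that row-building logic, using SimpleNamespace as a stand-in for the real ProfilingInfo and invented timing values:

```python
from types import SimpleNamespace

# Hypothetical base/comparison profiles; values are in seconds, already
# divided by 10 ** 6 as in the parsers above.
base = SimpleNamespace(flash_attention_time_fwd=1.234, flash_attention_time_bwd=2.468)
comp = SimpleNamespace(flash_attention_time_fwd=1.100, flash_attention_time_bwd=2.200)

headers, base_col, comp_col = [], [], []
for header, field in [('Flash Attention Time(Forward)', 'flash_attention_time_fwd'),
                      ('Flash Attention Time(Backward)', 'flash_attention_time_bwd')]:
    # A row is emitted only if either run recorded time for that phase.
    if getattr(base, field) or getattr(comp, field):
        headers.append(header)
        base_col.append(f'{getattr(base, field):.3f}s')
        comp_col.append(f'{getattr(comp, field):.3f}s')

print(headers)   # ['Flash Attention Time(Forward)', 'Flash Attention Time(Backward)']
print(base_col)  # ['1.234s', '2.468s']
print(comp_col)  # ['1.100s', '2.200s']
```

Keeping the two `if` checks independent is what allows the forward and backward rows to appear (or be suppressed) separately rather than as an all-or-nothing pair.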