diff --git a/profiler/compare_tools/profiling_analysis/gpu_parser.py b/profiler/compare_tools/profiling_analysis/gpu_parser.py
index 449bc7514fb7d15571b44a03ff1f49b535dd3aef..fee461e200ca517d09ba0384ec13be20350a624d 100644
--- a/profiler/compare_tools/profiling_analysis/gpu_parser.py
+++ b/profiler/compare_tools/profiling_analysis/gpu_parser.py
@@ -40,7 +40,8 @@ class GpuProfilingParser:
     def parse_events(self):
         cube_time = 0.0
         all_op_time = 0.0
-        fa_time = 0.0
+        fa_time_bwd = 0.0
+        fa_time_fwd = 0.0
         cube_num = 0
         vec_num = 0
         op_list = []
@@ -68,7 +69,10 @@ class GpuProfilingParser:
             for timestep in range(ts + 1, ts + dur + 1):
                 marks[str(timestep)] += -100  # mark this timestep in compute stream
             if self.is_flash_attention(name):
-                fa_time += float(dur)
+                if 'bwd' in name.lower():
+                    fa_time_bwd += float(dur)
+                else:
+                    fa_time_fwd += float(dur)
             elif self.CUBE_MARK in name.lower():
                 cube_num += 1
                 cube_time += float(dur)
@@ -80,9 +84,10 @@ class GpuProfilingParser:
         op_dataframe.to_csv('gpu_perf.csv', index=False)
         self.profiling_info.compute_time = len([_ for _, value in marks.items() if value < 0]) / 10 ** 6
         self.profiling_info.communication_not_overlapped = len([_ for _, value in marks.items() if value > 0]) / 10 ** 6
-        self.profiling_info.flash_attention_time = fa_time / 10 ** 6
+        self.profiling_info.flash_attention_time_bwd = fa_time_bwd / 10 ** 6
+        self.profiling_info.flash_attention_time_fwd = fa_time_fwd / 10 ** 6
         self.profiling_info.cube_time = cube_time / 10 ** 6
-        self.profiling_info.vec_time = (all_op_time - cube_time - fa_time) / 10 ** 6
+        self.profiling_info.vec_time = (all_op_time - cube_time - fa_time_bwd - fa_time_fwd) / 10 ** 6
         self.profiling_info.cube_num = cube_num
         self.profiling_info.vec_num = vec_num
         self.parse_e2e_time()
diff --git a/profiler/compare_tools/profiling_analysis/npu_parser.py b/profiler/compare_tools/profiling_analysis/npu_parser.py
index 34a191e44e5acfcd151d7d3c9d9659d3d4718f85..f5f5e60285215abbd43cd4603f196fcea0a0daca 100644
--- a/profiler/compare_tools/profiling_analysis/npu_parser.py
+++ b/profiler/compare_tools/profiling_analysis/npu_parser.py
@@ -119,7 +119,8 @@ class NpuProfilingParser:
         info = pd.read_csv(self.npu_summary_file, index_col=None)
         cube_time = 0.0
         vec_time = 0.0
-        fa_time = 0.0
+        fa_time_fwd = 0.0
+        fa_time_bwd = 0.0
         cube_num = 0
         vec_num = 0
         if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None:
@@ -132,7 +133,10 @@ class NpuProfilingParser:
                 continue
             task_durations = info.loc[i, 'Duration(us)']
             if self.FLASH_ATTENTION in op_type.lower():
-                fa_time += task_durations
+                if 'bwd' in op_type.lower() or 'grad' in op_type.lower():
+                    fa_time_bwd += task_durations
+                else:
+                    fa_time_fwd += task_durations
             elif aiv_vec_time > 0:
                 vec_time += task_durations
                 vec_num += 1
@@ -141,7 +145,8 @@ class NpuProfilingParser:
                 cube_num += 1
         self.profiling_info.cube_time = cube_time / 10 ** 6
         self.profiling_info.vec_time = vec_time / 10 ** 6
-        self.profiling_info.flash_attention_time = fa_time / 10 ** 6
+        self.profiling_info.flash_attention_time_bwd = fa_time_bwd / 10 ** 6
+        self.profiling_info.flash_attention_time_fwd = fa_time_fwd / 10 ** 6
         self.profiling_info.cube_num = cube_num
         self.profiling_info.vec_num = vec_num

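The backward/forward split in npu_parser.py must apply `in` to each substring separately: a condition written as `'bwd' or 'grad' in op_type.lower()` would parse as `('bwd') or ('grad' in op_type.lower())`, and since the non-empty literal `'bwd'` is always truthy, every flash-attention kernel would be counted as backward. A minimal, self-contained sketch of the classification both parsers rely on; the op names and durations below are illustrative, not taken from a real trace:

```python
def classify_fa_direction(op_name: str) -> str:
    """Classify a flash-attention kernel as forward or backward by substring.

    Each substring is tested against the lowered name explicitly;
    `if 'bwd' or 'grad' in lowered` would always take the bwd branch.
    """
    lowered = op_name.lower()
    return 'bwd' if ('bwd' in lowered or 'grad' in lowered) else 'fwd'


# Accumulate per-direction time (microseconds) over hypothetical example ops.
fa_time = {'fwd': 0.0, 'bwd': 0.0}
for name, dur in [('FlashAttentionScore', 120.0), ('FlashAttentionScoreGrad', 310.0)]:
    fa_time[classify_fa_direction(name)] += dur

assert fa_time == {'fwd': 120.0, 'bwd': 310.0}
```

The GPU path keys only on `'bwd'` in the kernel name, while the NPU path also accepts `'grad'`, reflecting the different naming conventions the two traces are assumed to use.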
diff --git a/profiler/compare_tools/profiling_analysis/parser_helper.py b/profiler/compare_tools/profiling_analysis/parser_helper.py
index 5fe5f1770c7a97ce27d5c488e6b3b84cab93b289..f46095ed0cada49034cea0469aa837a5d3e1c049 100644
--- a/profiler/compare_tools/profiling_analysis/parser_helper.py
+++ b/profiler/compare_tools/profiling_analysis/parser_helper.py
@@ -30,6 +30,7 @@ class ProfilingInfo:
         self.memory_used = 0.0
         self.e2e_time = 0.0
         self.scheduling_time = 0.0
-        self.flash_attention_time = 0.0
+        self.flash_attention_time_bwd = 0.0
+        self.flash_attention_time_fwd = 0.0
         self.minimal_profiling = False
         self.hide_op_details = False
diff --git a/profiler/compare_tools/profiling_analysis/profiling_parse.py b/profiler/compare_tools/profiling_analysis/profiling_parse.py
index 17b1cb30681a1c7e9b422169b127ce88ac8cd693..70b435f7a6d79e6845a3cc4c660d881364979b40 100644
--- a/profiler/compare_tools/profiling_analysis/profiling_parse.py
+++ b/profiler/compare_tools/profiling_analysis/profiling_parse.py
@@ -35,10 +35,14 @@ def generate_table_info(base_profiling_info, comp_profiling_info, table):
                      f'{base_profiling_info.vec_time:.3f}s({base_profiling_info.vec_num})'])
     comp_col.extend([f'{comp_profiling_info.cube_time:.3f}s({comp_profiling_info.cube_num})',
                      f'{comp_profiling_info.vec_time:.3f}s({comp_profiling_info.vec_num})'])
-    if base_profiling_info.flash_attention_time or comp_profiling_info.flash_attention_time:
-        headers.append('Flash Attention Time')
-        base_col.append(f'{base_profiling_info.flash_attention_time:.3f}s')
-        comp_col.append(f'{comp_profiling_info.flash_attention_time:.3f}s')
+    if base_profiling_info.flash_attention_time_fwd or comp_profiling_info.flash_attention_time_fwd:
+        headers.append('Flash Attention Time(Forward)')
+        base_col.append(f'{base_profiling_info.flash_attention_time_fwd:.3f}s')
+        comp_col.append(f'{comp_profiling_info.flash_attention_time_fwd:.3f}s')
+    if base_profiling_info.flash_attention_time_bwd or comp_profiling_info.flash_attention_time_bwd:
+        headers.append('Flash Attention Time(Backward)')
+        base_col.append(f'{base_profiling_info.flash_attention_time_bwd:.3f}s')
+        comp_col.append(f'{comp_profiling_info.flash_attention_time_bwd:.3f}s')
     headers.extend(['Computing Time'])
     base_col.extend([f'{base_profiling_info.compute_time:.3f}s'])
     comp_col.extend([f'{comp_profiling_info.compute_time:.3f}s'])
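Both parsers fill the same pair of ProfilingInfo fields, and generate_table_info appends one table row per phase only when either run recorded time for that phase, so a forward-only profile still renders without an empty Backward row. A hedged sketch of that row-building logic, using SimpleNamespace as a stand-in for the real ProfilingInfo and invented timing values:

```python
from types import SimpleNamespace

# Hypothetical base/comparison profiles; values are in seconds, already
# divided by 10 ** 6 as in the parsers above.
base = SimpleNamespace(flash_attention_time_fwd=1.234, flash_attention_time_bwd=2.468)
comp = SimpleNamespace(flash_attention_time_fwd=1.100, flash_attention_time_bwd=2.200)

headers, base_col, comp_col = [], [], []
for header, field in [('Flash Attention Time(Forward)', 'flash_attention_time_fwd'),
                      ('Flash Attention Time(Backward)', 'flash_attention_time_bwd')]:
    # A row is emitted only if either run recorded time for that phase.
    if getattr(base, field) or getattr(comp, field):
        headers.append(header)
        base_col.append(f'{getattr(base, field):.3f}s')
        comp_col.append(f'{getattr(comp, field):.3f}s')

print(headers)   # ['Flash Attention Time(Forward)', 'Flash Attention Time(Backward)']
print(base_col)  # ['1.234s', '2.468s']
print(comp_col)  # ['1.100s', '2.200s']
```

Keeping the two `if` checks independent is what allows the forward and backward rows to appear (or be suppressed) separately rather than as an all-or-nothing pair.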