diff --git a/profiler/compare_tools/profiling_analysis/gpu_parser.py b/profiler/compare_tools/profiling_analysis/gpu_parser.py
index 4443562bd4edae71d30c0314ea22756a8d20b534..a9c8466725ec66a16da836acb9d29029713f9492 100644
--- a/profiler/compare_tools/profiling_analysis/gpu_parser.py
+++ b/profiler/compare_tools/profiling_analysis/gpu_parser.py
@@ -26,26 +26,33 @@ class OpTimeWarper:
     def __init__(
         self,
         cube_time: float = 0.0,
+        sdma_time: float = 0.0,
+        vec_time: float = 0.0,
         fa_time_fwd: float = 0.0,
         fa_time_bwd: float = 0.0,
         all_op_time: float = 0.0,
         compute_stream_dur: float = 0.0,
         cube_num: int = 0,
-        vec_num: int = 0
+        vec_num: int = 0,
+        sdma_num: int = 0
     ):
         self.cube_time = cube_time
+        self.sdma_time = sdma_time
+        self.vec_time = vec_time
         self.fa_time_fwd = fa_time_fwd
         self.fa_time_bwd = fa_time_bwd
         self.all_op_time = all_op_time
         self.compute_stream_dur = compute_stream_dur
         self.cube_num = cube_num
         self.vec_num = vec_num
+        self.sdma_num = sdma_num
 
 
 class GpuProfilingParser:
     NCCL_MARK = 'nccl'
     CUBE_MARK = 'gemm'
     FA_MARK_LIST = [['fmha', 'kernel'], ['flash', 'kernel']]
+    SDMA_MARK_LIST = ['htod', 'dtod', 'dtoh', 'memset (device)']
 
     def __init__(self, gpu_path):
         self.trace_events = FileReader.read_trace_file(gpu_path).get('traceEvents')
@@ -59,13 +66,22 @@ class GpuProfilingParser:
                 return True
         return False
 
+    def is_sdma_time(self, name: str):
+        for mark in self.SDMA_MARK_LIST:
+            if mark in name.lower():
+                return True
+        return False
+
     def update_op_list(self, op_list, marks):
         cube_time = 0.0
         all_op_time = 0.0
         fa_time_bwd = 0.0
         fa_time_fwd = 0.0
+        sdma_time = 0.0
+        vec_time = 0.0
         cube_num = 0
         vec_num = 0
+        sdma_num = 0
         compute_stream_dur = 0.0
         for event in self.trace_events:
             if not isinstance(event, dict):
@@ -78,6 +94,9 @@ class GpuProfilingParser:
             dur = event.get('dur')
             ts = event.get('ts')
             cat = event.get('cat', '')
+            if self.is_sdma_time(name):
+                sdma_time += float(dur)
+                sdma_num += 1
             if cat.lower() != 'kernel':
                 continue
             if self.NCCL_MARK in name.lower():
@@ -97,16 +116,20 @@ class GpuProfilingParser:
                 cube_time += float(dur)
             else:
                 vec_num += 1
+                vec_time += float(dur)
             all_op_time += float(dur)
             op_list.append([ts, name, cat, dur])
         time_wrapper = OpTimeWarper(
             cube_time=cube_time,
+            sdma_time=sdma_time,
+            vec_time=vec_time,
             fa_time_fwd=fa_time_fwd,
             fa_time_bwd=fa_time_bwd,
             all_op_time=all_op_time,
             compute_stream_dur=compute_stream_dur,
             cube_num=cube_num,
-            vec_num=vec_num
+            vec_num=vec_num,
+            sdma_num=sdma_num
         )
         return time_wrapper
 
@@ -122,15 +145,20 @@ class GpuProfilingParser:
         compute_stream_dur = time_wrapper.compute_stream_dur
         cube_num = time_wrapper.cube_num
         vec_num = time_wrapper.vec_num
+        sdma_num = time_wrapper.sdma_num
+        sdma_time = time_wrapper.sdma_time
+        vec_time = time_wrapper.vec_time
 
         self.profiling_info.compute_time = len([_ for _, value in marks.items() if value < 0]) / 10 ** 6
         self.profiling_info.communication_not_overlapped = len([_ for _, value in marks.items() if value > 0]) / 10 ** 6
         self.profiling_info.flash_attention_time_bwd = fa_time_bwd / 10 ** 6
         self.profiling_info.flash_attention_time_fwd = fa_time_fwd / 10 ** 6
         self.profiling_info.cube_time = cube_time / 10 ** 6
-        self.profiling_info.vec_time = (all_op_time - cube_time - fa_time_fwd - fa_time_bwd) / 10 ** 6
+        self.profiling_info.vec_time = self.profiling_info.compute_time - (cube_time + fa_time_fwd + fa_time_bwd) / 10 ** 6
         self.profiling_info.cube_num = cube_num
         self.profiling_info.vec_num = vec_num
+        self.profiling_info.sdma_num = sdma_num
+        self.profiling_info.sdma_time = sdma_time / 10 ** 6
         self.parse_e2e_time()
 
         self.profiling_info.scheduling_time = self.profiling_info.e2e_time - all_op_time / 10 ** 6 - \
diff --git a/profiler/compare_tools/profiling_analysis/npu_parser.py b/profiler/compare_tools/profiling_analysis/npu_parser.py
index 2c71b0dc4a5e10f5f40f70da618dd017cfc461f7..8c9b3e75c0546d387fc862731a2b47163df58ce8 100644
--- a/profiler/compare_tools/profiling_analysis/npu_parser.py
+++ b/profiler/compare_tools/profiling_analysis/npu_parser.py
@@ -27,6 +27,8 @@ class NpuInfoWrapper:
         self,
         compute_time: int,
         communication_time: int,
+        sdma_time: int,
+        sdma_num: int,
         is_cluster: bool,
         event_wait_sqe: dict,
         ai_core_dict: dict,
@@ -35,6 +37,8 @@ class NpuInfoWrapper:
     ):
         self.compute_time = compute_time
         self.communication_time = communication_time
+        self.sdma_time = sdma_time
+        self.sdma_num = sdma_num
         self.is_cluster = is_cluster
         self.event_wait_sqe = event_wait_sqe
         self.ai_core_dict = ai_core_dict
@@ -56,6 +60,30 @@ class NpuProfilingParser:
         self.aicore_time = 0
         self.min_stream_ts = sys.float_info.max
         self.max_stream_ts = sys.float_info.min
+        self.sdma_sqe = defaultdict(float)
+        self.sdma_num_cnt = defaultdict(int)
+
+    def get_sdma_para(self, sdma_sqe, sdma_num_cnt, ai_core_dict, event_wait_sqe) -> (float, int):
+        compute_stream = []
+        parallel_stream = []
+        sdma_time = 0.0
+        sdma_parallel_time = 0.0
+        sdma_num = 0
+        sdma_parallel_num = 0
+        if len(ai_core_dict) == 1:
+            compute_stream.append(min(ai_core_dict.keys()))
+        elif len(ai_core_dict) == 2:  # 2个ai_core，存在并行流（当前最多2条算子计算流）
+            compute_stream = list(event_wait_sqe.keys() & ai_core_dict.keys())
+            parallel_stream = list(ai_core_dict.keys() - set(compute_stream))
+        else:
+            print('[WARNING] Npu Compute Stream Num Error.')
+        if parallel_stream:
+            sdma_parallel_time = sdma_sqe[parallel_stream[0]]
+            sdma_parallel_num = sdma_num_cnt[parallel_stream[0]]
+        if compute_stream:
+            sdma_time = sdma_sqe[compute_stream[0]] + sdma_parallel_time
+            sdma_num = sdma_num_cnt[compute_stream[0]] + sdma_parallel_num
+        return sdma_time, sdma_num
 
     def parse_npu_json_events(self):
         if not self.npu_json_file:
@@ -87,8 +115,9 @@ class NpuProfilingParser:
                 communication_time += dur
                 min_ts = ts if ts < min_ts else min_ts
                 max_ts = (ts + dur) if (ts + dur) > max_ts else max_ts
+        sdma_time, sdma_num = self.get_sdma_para(self.sdma_sqe, self.sdma_num_cnt, ai_core_dict, event_wait_sqe)
         npu_info_wrapper = NpuInfoWrapper(
-            compute_time, communication_time, is_cluster,
+            compute_time, communication_time, sdma_time, sdma_num, is_cluster,
             event_wait_sqe, ai_core_dict, event_wait_sqe_res, ai_core_res)
         self.update_npu_info(max_ts - min_ts, npu_info_wrapper)
 
@@ -100,6 +129,8 @@ class NpuProfilingParser:
         ai_core_dict = npu_info_wrapper.ai_core_dict
         event_wait_sqe_res = npu_info_wrapper.event_wait_sqe_res
         ai_core_res = npu_info_wrapper.ai_core_res
+        sdma_time = npu_info_wrapper.sdma_time
+        sdma_num = npu_info_wrapper.sdma_num
         # AI_CORE和EVENT_WAIT_SQE共存为计算流
         compute_stream = []
         parallel_stream = []
@@ -121,11 +152,16 @@ class NpuProfilingParser:
                 self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list)
         self.profiling_info.compute_time = compute_time / 10 ** 6 if is_cluster else \
             ai_core_res[compute_stream[0]] / 10 ** 6
+        self.profiling_info.other_time = self.profiling_info.compute_time - self.profiling_info.cube_time - \
+            self.profiling_info.flash_attention_time_fwd - self.profiling_info.flash_attention_time_bwd - \
+            self.profiling_info.vec_time
         self.profiling_info.e2e_time = ts_dur / 10 ** 6 if is_cluster else \
             (self.max_stream_ts - self.min_stream_ts) / 10 ** 6
         self.profiling_info.communication_not_overlapped = communication_time / 10 ** 6 \
             if is_cluster else (event_wait_sqe_res[compute_stream[0]] - self.parallel_time) / 10 ** 6
         time_required = self.profiling_info.compute_time + self.profiling_info.communication_not_overlapped
+        self.profiling_info.sdma_time = sdma_time / 10 ** 6
+        self.profiling_info.sdma_num = sdma_num
         if self.npu_step_time:
             self.profiling_info.scheduling_time = self.npu_step_time - time_required
         else:
@@ -226,6 +262,9 @@ class NpuProfilingParser:
             if args.get('Task Type') == 'EVENT_WAIT_SQE':
                 enent_wait_res[stream_id] += dur
                 event_wait_sqe[stream_id].append([ts, ts + dur])
+            elif args.get('Task Type') in ('SDMA_SQE', 'PCIE_DMA_SQE'):
+                self.sdma_sqe[stream_id] += dur
+                self.sdma_num_cnt[stream_id] += 1
             elif args.get('Task Type') in ('AI_CORE', 'MIX_AIC', 'MIX_AIV', 'AI_CPU', 'AI_VECTOR_CORE', 'FFTS_PLUS'):
                 ai_core_res[stream_id] += dur
                 ai_core_dict[stream_id].append([ts, ts + dur])
diff --git a/profiler/compare_tools/profiling_analysis/parser_helper.py b/profiler/compare_tools/profiling_analysis/parser_helper.py
index f46095ed0cada49034cea0469aa837a5d3e1c049..3b42d9308ce823a527168bd831946557d9f310f9 100644
--- a/profiler/compare_tools/profiling_analysis/parser_helper.py
+++ b/profiler/compare_tools/profiling_analysis/parser_helper.py
@@ -21,6 +21,7 @@ class ProfilingInfo:
     def __init__(self, profiling_type: str):
         self.profiling_type = profiling_type
         self.cube_time = 0.0
+        self.other_time = 0.0
         self.vec_time = 0.0
         self.cube_num = 0
         self.vec_num = 0
@@ -29,6 +30,7 @@ class ProfilingInfo:
         self.scheduling_ratio = 0.0
         self.memory_used = 0.0
         self.e2e_time = 0.0
+        self.sdma_time = 0.0
         self.scheduling_time = 0.0
         self.flash_attention_time_bwd = 0.0
         self.flash_attention_time_fwd = 0.0
diff --git a/profiler/compare_tools/profiling_analysis/profiling_parse.py b/profiler/compare_tools/profiling_analysis/profiling_parse.py
index 70b435f7a6d79e6845a3cc4c660d881364979b40..434fba8309e95112e17ebdb9c92714a3ba545053 100644
--- a/profiler/compare_tools/profiling_analysis/profiling_parse.py
+++ b/profiler/compare_tools/profiling_analysis/profiling_parse.py
@@ -35,6 +35,10 @@ def generate_table_info(base_profiling_info, comp_profiling_info, table):
                          f'{base_profiling_info.vec_time:.3f}s({base_profiling_info.vec_num})'])
         comp_col.extend([f'{comp_profiling_info.cube_time:.3f}s({comp_profiling_info.cube_num})',
                          f'{comp_profiling_info.vec_time:.3f}s({comp_profiling_info.vec_num})'])
+    if base_profiling_info.other_time or comp_profiling_info.other_time:
+        headers.append('Other Time')
+        base_col.append(f'{base_profiling_info.other_time:.3f}s')
+        comp_col.append(f'{comp_profiling_info.other_time:.3f}s')
     if base_profiling_info.flash_attention_time_fwd or comp_profiling_info.flash_attention_time_fwd:
         headers.append('Flash Attention Time(Forward)')
         base_col.append(f'{base_profiling_info.flash_attention_time_fwd:.3f}s')
@@ -50,18 +54,26 @@ def generate_table_info(base_profiling_info, comp_profiling_info, table):
         headers.append('Mem Usage')
         base_col.append(f'{base_profiling_info.memory_used:.2f}G')
         comp_col.append(f'{comp_profiling_info.memory_used:.2f}G')
+    headers.extend(['Uncovered Communication Time'])
+    base_col.extend(
+        [f'{base_profiling_info.communication_not_overlapped: .3f}s'])
+    comp_col.extend(
+        [f'{comp_profiling_info.communication_not_overlapped: .3f}s'])
+    if base_profiling_info.sdma_time or comp_profiling_info.sdma_time:
+        headers.append('SDMA Time(Num)')
+        base_col.append(f'{base_profiling_info.sdma_time:.3f}s({base_profiling_info.sdma_num})')
+        comp_col.append(f'{comp_profiling_info.sdma_time:.3f}s({comp_profiling_info.sdma_num})')
     cue = ''
     if ((base_profiling_info.profiling_type == "NPU" and not base_profiling_info.minimal_profiling) or
             (comp_profiling_info.profiling_type == "NPU" and not comp_profiling_info.minimal_profiling)):
+            
         cue = '(Not minimal profiling)'
 
-    headers.extend(['Uncovered Communication Time', 'Free Time', 'E2E Time' + cue])
+    headers.extend(['Free Time', 'E2E Time' + cue])
     base_col.extend(
-        [f'{base_profiling_info.communication_not_overlapped: .3f}s', f'{base_profiling_info.scheduling_time:.3f}s',
-         f'{base_profiling_info.e2e_time:.3f}s'])
+        [f'{base_profiling_info.scheduling_time:.3f}s', f'{base_profiling_info.e2e_time:.3f}s'])
     comp_col.extend(
-        [f'{comp_profiling_info.communication_not_overlapped: .3f}s', f'{comp_profiling_info.scheduling_time:.3f}s',
-         f'{comp_profiling_info.e2e_time:.3f}s'])
+        [f'{comp_profiling_info.scheduling_time:.3f}s', f'{comp_profiling_info.e2e_time:.3f}s'])
     table.field_names = headers
     table.add_row(base_col)
     table.add_row(comp_col)