diff --git a/debug/tools/profiling_analyse/README.md b/debug/tools/profiling_analyse/README.md
index d871f019827de5a1fa99368962d0938c54cd4a98..de39ab939eb075df115734cd9a0b34f7468de5ab 100644
--- a/debug/tools/profiling_analyse/README.md
+++ b/debug/tools/profiling_analyse/README.md
@@ -22,13 +22,16 @@ GPU memory usage can be viewed with nvidia-smi; analysis via the json file requires
 1. The operator timing profiling data is located in the op_summary_x_1.csv file under /PROFxxx/device_x/summary.
 2. Currently only the time of operators running on the vector and cube units is counted.
 3. Both kinds of operators appear in the csv file with TaskType AI_CORE; an operator whose aiv_vec_time dominates is a vector operator, while a dominant aic_mac_time indicates a cube operator. The durations of each kind are summed separately and output.
+4. If an operator has no pmu information, it is classified by cube operator type; the currently known cube operator types are ['MatMul', 'BatchMatMul'].
+5. Users who need additional cube operators can add operator names via the optional argument '-aop'; see the example for usage.
 
 ### Communication
-Communication here refers to the non-overlapped communication time, corresponding to EVENT_WAIT_SQE in ASCEND_PROFILER_OUTPUT/trace_view.json; when several Stream Ids are present, the smallest Stream Id is taken.
-The output is the sum of the durations of that field.
+1. Communication here refers to the non-overlapped communication time, i.e. the total EVENT_WAIT_SQE duration of a single stream in ASCEND_PROFILER_OUTPUT/trace_view.json.
+2. The compute stream is selected from trace_view, i.e. the stream that contains both EVENT_WAIT_SQE tasks and tasks whose Task Type is AI_CORE.
+3. When AI_CORE tasks appear on two streams, the AI_CORE time produced by the other stream within the same time intervals is subtracted while accumulating EVENT_WAIT_SQE on the compute stream.
 
 ### Compute stream e2e time
-This time is obtained from the minimum and maximum timestamps 'ts' in trace_view.json; the absolute value of their difference is the e2e time.
+This time is obtained from the minimum and maximum 'ts' timestamps of the compute_time events in trace_view.json; the absolute value of their difference is the e2e time.
 
 ### Scheduling ratio
 1. Obtaining the scheduling ratio first requires the scheduling time: scheduling ratio = scheduling time / e2e time * 100%.
@@ -42,5 +45,5 @@
 
 ## Example
 - step1: Collect the gpu and npu profiling data; if the memory switch was not enabled during collection, there is no memory usage data.
-- Run command: python profiling_parse.py -g gpu\gpu_trace_device0.json -glt 0.9 -n npu\xxx_ascend_pt -nlt 1.2
-- Output: a printed performance breakdown comparing gpu and npu.
+- Run command: python profiling_parse.py -g gpu\gpu_trace_device0.json -glt 0.9 -n npu\xxx_ascend_pt -nlt 1.2 -aop op1 op2
+- Output: a printed performance breakdown comparing gpu and npu, where -nlt is the npu step time (in seconds) supplied by the user and -aop lists manually added cube operator types.
diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py
index 611343027b89643910ec7a93188c20c8f051091a..bd8d8350f2768c783d0183476588d6a94e1eaa52 100644
--- a/debug/tools/profiling_analyse/npu_parser.py
+++ b/debug/tools/profiling_analyse/npu_parser.py
@@ -5,73 +5,101 @@ import parser_helper
 
 
 class NpuProfilingParser:
-    def __init__(self, npu_step_time, npu_file_path):
+    def __init__(self, npu_step_time, add_cube_name, npu_file_path):
         self.npu_json_file = npu_file_path.get('trace_view')
         self.npu_summary_file = npu_file_path.get('op_summary')
         self.npu_mem_file = npu_file_path.get('memory_record')
         self.profiling_info = parser_helper.ProfilingInfo()
         self.npu_step_time = npu_step_time
+        self.parallel_time = 0
+        self.aicore_time = 0
+        self.cube_op_type = ['MatMul', 'BatchMatMul']
+        self.cube_op_type = list(set(self.cube_op_type + add_cube_name))
 
     def parse_npu_json_events(self):
-        event_wait_sqe = defaultdict(float)
         if not self.npu_json_file:
             print('Npu trace json file is not available.')
             return
+        compute_time = 0
         min_ts = sys.float_info.max
         max_ts = sys.float_info.min
+        ts_flag = False  # False means no compute_time event has been found yet
         data = parser_helper.read_json_file(self.npu_json_file)
+        event_wait_sqe = defaultdict(list)
+        ai_core_dict = defaultdict(list)
+        event_wait_sqe_res = defaultdict(float)
         for dic in data:
-            if dic.get('name') == 'EVENT_WAIT_SQE':
-                args = dic.get('args')
-                stream_id = args.get('Stream Id')
-                event_wait_sqe[stream_id] += dic.get('dur')
-            if dic.get('ts'):
+            self.get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, event_wait_sqe_res)
+            if ('name' in dic) and (dic.get('name') == 'compute_time'):
+                ts_flag = True
                 ts = dic.get('ts')
+                dur = dic.get('dur')
+                compute_time += dur
                 min_ts = ts if ts < min_ts else min_ts
-                max_ts = ts if ts > max_ts else max_ts
-        self.profiling_info.e2e_time = (max_ts - min_ts) / 1000 / 1000
-        self.profiling_info.communication_not_overlapped = event_wait_sqe.get(min(event_wait_sqe)) / 1000 / 1000
+                max_ts = (ts + dur) if (ts + dur) > max_ts else max_ts
+        # a stream that contains both AI_CORE and EVENT_WAIT_SQE tasks is the compute stream
+        compute_stream = []
+        parallel_stream = []
+        # no parallel operator stream
+        if len(ai_core_dict) == 1:
+            compute_stream.append(min(ai_core_dict.keys()))
+        elif len(ai_core_dict) == 2:  # two ai_core streams: a parallel stream exists (at most 2 operator compute streams currently)
+            compute_stream = list(event_wait_sqe.keys() & ai_core_dict.keys())
+            parallel_stream = list(ai_core_dict.keys() - set(compute_stream))
+        cs_event_wait_sqe_list = event_wait_sqe[compute_stream[0]]
+        if parallel_stream:
+            cs_ai_core_list = ai_core_dict[parallel_stream[0]]
+            cs_event_wait_sqe_list.sort(key=lambda x: x[0])
+            cs_ai_core_list.sort(key=lambda x: x[0])
+            self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list)
+        self.profiling_info.compute_time = compute_time / 10 ** 6
+        self.profiling_info.e2e_time = (max_ts - min_ts) / 10 ** 6 if ts_flag else 0
+        self.profiling_info.communication_not_overlapped = (event_wait_sqe_res[compute_stream[0]] -
+                                                            self.parallel_time) / 10 ** 6
         time_required = (self.profiling_info.cube_time + self.profiling_info.vector_time) + \
             self.profiling_info.communication_not_overlapped
         if self.npu_step_time:
             self.profiling_info.scheduling_time = self.npu_step_time - time_required
         else:
             self.profiling_info.scheduling_time = self.profiling_info.e2e_time - time_required
-        self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time
+        self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time \
+            if self.profiling_info.e2e_time != 0 else 0
 
     def parse_npu_csv_events(self):
         if not self.npu_summary_file:
             print('Npu op summary csv file is not available.')
             return
         info = pd.read_csv(self.npu_summary_file, index_col=None)
-        op_statics_result = {}
         cube_time = 0.0
         vec_time = 0.0
-        length = len(info['Model ID'])
+        ai_core_time = 0.0
+        vec_mac_flag = True  # True means the summary file contains pmu info
         if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None:
-            raise ValueError('There is no cube time or vector time in the csv!The aic_mac_time(us) and '
-                             'aiv_vec_time(us) are necessary for the determination of cube and vector.')
-
-        for i in range(length):
-            op_type = info.loc[i, 'OP Type']
+            print('The profiling result may be in simplified mode; cube operators are identified by the whitelist below:')
+            print(self.cube_op_type)
+            vec_mac_flag = False
+        for i in range(len(info['Model ID'])):
             task_type = info.loc[i, 'Task Type']
+            if task_type not in ['AI_CORE']:
+                continue
             task_durations = info.loc[i, 'Task Duration(us)']
-            aic_mac_time = info.loc[i, 'aic_mac_time(us)']
+            ai_core_time += task_durations
+            op_type = info.loc[i, 'OP Type']
+            if not vec_mac_flag:  # simplified mode: classify by OP Type, then skip the pmu-based logic
+                cube_time += task_durations if op_type in self.cube_op_type else 0.0
+                continue
             aiv_vec_time = info.loc[i, 'aiv_vec_time(us)']
-
-            if task_type in ['AI_CORE']:
-                if aiv_vec_time > aic_mac_time:
-                    vec_time += task_durations
-                    if op_statics_result.get(op_type) is None:
-                        op_statics_result[op_type] = [task_durations, 'vector']
-                    else:
-                        op_statics_result[op_type][0] += task_durations
-                else:
-                    cube_time += task_durations
-                    if op_statics_result.get(op_type) is None:
-                        op_statics_result[op_type] = [task_durations, 'cube']
-                    else:
-                        op_statics_result[op_type][0] += task_durations
+            if aiv_vec_time > 0:
+                vec_time += task_durations
+
+        if vec_mac_flag:
+            cube_time = (ai_core_time - vec_time) / 10 ** 6
+            vec_time /= 10 ** 6
+        else:
+            vec_time = (ai_core_time - cube_time) / 10 ** 6
+            cube_time /= 10 ** 6
+        self.profiling_info.cube_time = cube_time
+        self.profiling_info.vector_time = vec_time
         if not self.npu_mem_file:
             print('Npu op memory csv file is not available.')
             return
@@ -81,5 +109,34 @@ class NpuProfilingParser:
             print('Npu profiling data does not contain memory info.')
         else:
             self.profiling_info.memory_used = max(info.get('Total Reserved(MB)')) / 1024
-        self.profiling_info.cube_time = cube_time / 10 ** 6
-        self.profiling_info.vector_time = vec_time / 10 ** 6
+
+    @staticmethod
+    def interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list):
+        ans = 0
+        i = 0
+        j = 0
+        while i < len(cs_event_wait_sqe_list) and j < len(cs_ai_core_list):
+            lo = max(cs_event_wait_sqe_list[i][0], cs_ai_core_list[j][0])
+            hi = min(cs_event_wait_sqe_list[i][1], cs_ai_core_list[j][1])
+            if lo <= hi:
+                ans += (hi - lo)
+            if cs_event_wait_sqe_list[i][1] < cs_ai_core_list[j][1]:
+                i += 1
+            else:
+                j += 1
+        return ans
+
+    @staticmethod
+    def get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, res):
+        if not dic.get('args'):
+            return
+        args = dic.get('args')
+        if args.get('Stream Id'):
+            stream_id = args.get('Stream Id')
+            ts = dic.get('ts')
+            dur = dic.get('dur')
+            if args.get('Task Type') == 'EVENT_WAIT_SQE':
+                res[stream_id] += dur
+                event_wait_sqe[stream_id].append([ts, ts + dur])
+            elif args.get('Task Type') == 'AI_CORE':
+                ai_core_dict[stream_id].append([ts, ts + dur])
diff --git a/debug/tools/profiling_analyse/parser_helper.py b/debug/tools/profiling_analyse/parser_helper.py
index 1477db6bcee6a3edf98a8fb8f7494f1f98556bb9..6b91fdd5342c91e8ea251e0232e08bfc44012593 100644
--- a/debug/tools/profiling_analyse/parser_helper.py
+++ b/debug/tools/profiling_analyse/parser_helper.py
@@ -6,7 +6,7 @@ class ProfilingInfo:
     def __init__(self):
         self.cube_time = 0.0
         self.vector_time = 0.0
-        self.large_kernel = 0.0
+        self.compute_time = 0.0
         self.communication_not_overlapped = 0.0
         self.scheduling_ratio = 0.0
         self.memory_used = 0.0
diff --git a/debug/tools/profiling_analyse/profiling_parse.py b/debug/tools/profiling_analyse/profiling_parse.py
index f3e891473728008dd0f070f080451b1c61ef8244..2f773495344e4f29a4d691d6818abf9b30e31a34 100644
--- a/debug/tools/profiling_analyse/profiling_parse.py
+++ b/debug/tools/profiling_analyse/profiling_parse.py
@@ -16,20 +16,21 @@
                         help='Npu single core profiling root path.')
     parser.add_argument('-nlt', '--npu_log_time', required=False, default=0.0, metavar='(FILE)', type=float,
                         help='Npu one step time(s).')
+    parser.add_argument('-aop', '--add_cube_op', required=False, default=[], nargs='*', help='add cube op name')
     return parser.parse_args()
 
 
 def show_table(gpu_profiling_info, npu_profiling_info):
     table = PrettyTable()
     table.title = 'Large model performance breakdown'
-    table.field_names = ['', 'cube ops', 'vector ops', 'large-kernel ops', 'communication', 'scheduling time', 'scheduling ratio', 'memory',
+    table.field_names = ['', 'cube ops', 'vector ops', 'compute stream time', 'communication', 'scheduling time', 'scheduling ratio', 'memory',
                          'E2E time']
     table.add_row(['GPU baseline', f'{gpu_profiling_info.cube_time:.3f}s', f'{gpu_profiling_info.vector_time:.3f}s',
-                   f'{gpu_profiling_info.large_kernel:.3f}s', f'{gpu_profiling_info.communication_not_overlapped: .3f}s',
+                   f'{gpu_profiling_info.compute_time:.3f}s', f'{gpu_profiling_info.communication_not_overlapped: .3f}s',
                    f'{gpu_profiling_info.scheduling_time:.3f}', f'{gpu_profiling_info.scheduling_ratio:.2%}',
                    f'{gpu_profiling_info.memory_used:.2f}G', f'{gpu_profiling_info.e2e_time:.3f}s'])
     table.add_row(['Current status', f'{npu_profiling_info.cube_time:.3f}s', f'{npu_profiling_info.vector_time:.3f}s',
-                   f'{npu_profiling_info.large_kernel:.3f}s', f'{npu_profiling_info.communication_not_overlapped: .3f}s',
+                   f'{npu_profiling_info.compute_time:.3f}s', f'{npu_profiling_info.communication_not_overlapped: .3f}s',
                    f'{npu_profiling_info.scheduling_time:.3f}', f'{npu_profiling_info.scheduling_ratio:.2%}',
                    f'{npu_profiling_info.memory_used:.2f}G', f'{npu_profiling_info.e2e_time:.3f}s'])
     print(table)
@@ -47,7 +48,7 @@ def parse_gpu(args):
 
 
 def parse_npu(args, npu_path):
-    npu_parser = NpuProfilingParser(args.npu_log_time, npu_path)
+    npu_parser = NpuProfilingParser(args.npu_log_time, args.add_cube_op, npu_path)
     npu_parser.parse_npu_csv_events()
     npu_parser.parse_npu_json_events()
     return npu_parser.profiling_info
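
As an aside on the non-overlapped communication change above: the snippet below is a minimal, standalone sketch of the two-pointer interval-intersection idea that `NpuProfilingParser.interval_intersection` uses to measure how much AI_CORE work on the parallel stream overlaps the compute stream's EVENT_WAIT_SQE intervals. The interval values here are made up purely for illustration and are not real profiling data.

```python
from typing import List


def interval_intersection(wait_intervals: List[List[float]], ai_core_intervals: List[List[float]]) -> float:
    """Total overlap between two lists of [start, end] intervals, each sorted by start time."""
    ans = 0.0
    i = j = 0
    while i < len(wait_intervals) and j < len(ai_core_intervals):
        lo = max(wait_intervals[i][0], ai_core_intervals[j][0])
        hi = min(wait_intervals[i][1], ai_core_intervals[j][1])
        if lo <= hi:
            ans += hi - lo
        # advance whichever interval ends first
        if wait_intervals[i][1] < ai_core_intervals[j][1]:
            i += 1
        else:
            j += 1
    return ans


if __name__ == '__main__':
    # made-up EVENT_WAIT_SQE intervals on the compute stream and AI_CORE intervals on the other stream (us)
    wait_sqe = [[0.0, 10.0], [20.0, 35.0]]
    ai_core = [[5.0, 12.0], [30.0, 40.0]]
    overlap = interval_intersection(wait_sqe, ai_core)
    print(f'parallel overlap: {overlap} us')  # 5 + 5 = 10 us
```

The overlap computed this way corresponds to `self.parallel_time`, which is subtracted from the compute stream's EVENT_WAIT_SQE total before the result is converted to seconds.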