From 7ddbf345105ca9044fa81b123ac00cd1090dc283 Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 16:58:50 +0800 Subject: [PATCH 1/7] =?UTF-8?q?=E5=AE=8C=E5=96=84=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E6=83=85=E5=86=B5=EF=BC=8C=E4=BF=AE=E6=AD=A3=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E8=AE=A1=E7=AE=97=E6=96=B9=E5=BC=8F=EF=BC=8C=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E5=B9=B6=E8=A1=8C=E8=AE=A1=E7=AE=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/tools/profiling_analyse/npu_parser.py | 110 +++++++++++++----- .../tools/profiling_analyse/parser_helper.py | 2 +- .../profiling_analyse/profiling_parse.py | 6 +- 3 files changed, 83 insertions(+), 35 deletions(-) diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py index 611343027..13ab4c9cc 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -11,26 +11,46 @@ class NpuProfilingParser: self.npu_mem_file = npu_file_path.get('memory_record') self.profiling_info = parser_helper.ProfilingInfo() self.npu_step_time = npu_step_time + self.parallel_time = 0 + self.aicore_time = 0 def parse_npu_json_events(self): - event_wait_sqe = defaultdict(float) if not self.npu_json_file: print('Npu trace json file is not available.') return + compute_time = 0 min_ts = sys.float_info.max max_ts = sys.float_info.min data = parser_helper.read_json_file(self.npu_json_file) + event_wait_sqe = defaultdict(list) + ai_core_dict = defaultdict(list) + event_wait_sqe_res = defaultdict(float) for dic in data: - if dic.get('name') == 'EVENT_WAIT_SQE': - args = dic.get('args') - stream_id = args.get('Stream Id') - event_wait_sqe[stream_id] += dic.get('dur') - if dic.get('ts'): + self.get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, event_wait_sqe_res) + if ('name' in dic) and (dic.get('name') == 'compute_time'): + compute_time += dic.get('dur') ts = dic.get('ts') min_ts = ts if ts < min_ts else min_ts 
max_ts = ts if ts > max_ts else max_ts + # AI_CORE和EVENT_WAIT_SQE共存为计算流 + compute_stream = [] + parallel_stream = [] + # 不存在算子并行的情况 + if len(ai_core_dict) == 1: + compute_stream.appen(min(ai_core_dict.keys())) + elif len(ai_core_dict) == 2: # 2个ai_core,存在并行流(当前最多2条算子计算流) + compute_stream = list(event_wait_sqe.keys() & ai_core_dict.keys()) + parallel_stream = list(ai_core_dict.keys() - set(compute_stream)) + cs_event_wait_sqe_list = event_wait_sqe[compute_stream[0]] + if parallel_stream: + cs_ai_core_list = ai_core_dict[parallel_stream[0]] + sorted(cs_event_wait_sqe_list, key=lambda x: (x[0])) + sorted(cs_ai_core_list, key=lambda x: (x[0])) + self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list) + self.profiling.compute_time = compute_time / 10 ** 6 self.profiling_info.e2e_time = (max_ts - min_ts) / 1000 / 1000 - self.profiling_info.communication_not_overlapped = event_wait_sqe.get(min(event_wait_sqe)) / 1000 / 1000 + self.profiling_info.communication_not_overlapped = (event_wait_sqe_res[compute_stream[0]] - + self.parallel_time) / 10 ** 6 time_required = (self.profiling_info.cube_time + self.profiling_info.vector_time) + \ self.profiling_info.communication_not_overlapped if self.npu_step_time: @@ -44,34 +64,32 @@ class NpuProfilingParser: print('Npu op summary csv file is not available.') return info = pd.read_csv(self.npu_summary_file, index_col=None) - op_statics_result = {} cube_time = 0.0 vec_time = 0.0 - length = len(info['Model ID']) + ai_core_time = 0.0 + vec_mac_flag = True # True标记当前summary文件中存在pmu信息 if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None: - raise ValueError('There is no cube time or vector time in the csv!The aic_mac_time(us) and ' - 'aiv_vec_time(us) are necessary for the determination of cube and vector.') - - for i in range(length): - op_type = info.loc[i, 'OP Type'] + print('当前的profiling结果可能是极简模式,无法区分cube和vector,总的ai_core耗时会展示在vector算子列') + vec_mac_flag = False + for i in 
range(len(info['Model ID'])): task_type = info.loc[i, 'Task Type'] + if task_type not in ['AI_CORE']: + continue task_durations = info.loc[i, 'Task Duration(us)'] - aic_mac_time = info.loc[i, 'aic_mac_time(us)'] + ai_core_time += task_durations + if not vec_mac_flag: + continue aiv_vec_time = info.loc[i, 'aiv_vec_time(us)'] - - if task_type in ['AI_CORE']: - if aiv_vec_time > aic_mac_time: - vec_time += task_durations - if op_statics_result.get(op_type) is None: - op_statics_result[op_type] = [task_durations, 'vector'] - else: - op_statics_result[op_type][0] += task_durations - else: - cube_time += task_durations - if op_statics_result.get(op_type) is None: - op_statics_result[op_type] = [task_durations, 'cube'] - else: - op_statics_result[op_type][0] += task_durations + if aiv_vec_time > 0: + vec_time += task_durations + + if vec_mac_flag: + cube_time = (ai_core_time - vec_time) / 10 ** 6 + vec_time /= 10 ** 6 + else: + vec_time = ai_core_time / 10 ** 6 + self.profiling_info.cube_time = cube_time + self.profiling_info.vector_time = vec_time if not self.npu_mem_file: print('Npu op memory csv file is not available.') return @@ -81,5 +99,35 @@ class NpuProfilingParser: print('Npu profiling data does not contain memory info.') else: self.profiling_info.memory_used = max(info.get('Total Reserved(MB)')) / 1024 - self.profiling_info.cube_time = cube_time / 10 ** 6 - self.profiling_info.vector_time = vec_time / 10 ** 6 + + @staticmethod + def interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list): + ans = 0 + i = 0 + j = 0 + while i < len(cs_event_wait_sqe_list) and j < len(cs_ai_core_list): + lo = max(cs_event_wait_sqe_list[i][0], cs_ai_core_list[j][0]) + hi = max(cs_event_wait_sqe_list[i][1], cs_ai_core_list[j][1]) + if lo < hi: + ans += (hi - lo) + if cs_event_wait_sqe_list[i][1] < cs_ai_core_list[j][1]: + i += 1 + else: + j += 1 + return ans + + + @staticmethod + def get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, res): + if not dic.get('args'): + 
return + args = dic.get('args') + if args.get('Stream Id'): + stream_id = args.get('Stream Id') + ts = dic.get('ts') + dur = dic.get('dur') + if args.get('Task Type') == 'EVENT_WAIT_SQE': + res[stream_id] += dur + event_wait_sqe[stream_id].append([ts, ts + dur]) + elif: + ai_core_dict[stream_id].append([ts, ts + dur]) \ No newline at end of file diff --git a/debug/tools/profiling_analyse/parser_helper.py b/debug/tools/profiling_analyse/parser_helper.py index 1477db6bc..6b91fdd53 100644 --- a/debug/tools/profiling_analyse/parser_helper.py +++ b/debug/tools/profiling_analyse/parser_helper.py @@ -6,7 +6,7 @@ class ProfilingInfo: def __init__(self): self.cube_time = 0.0 self.vector_time = 0.0 - self.large_kernel = 0.0 + self.compute_time = 0.0 self.communication_not_overlapped = 0.0 self.scheduling_ratio = 0.0 self.memory_used = 0.0 diff --git a/debug/tools/profiling_analyse/profiling_parse.py b/debug/tools/profiling_analyse/profiling_parse.py index f3e891473..b5dc9c385 100644 --- a/debug/tools/profiling_analyse/profiling_parse.py +++ b/debug/tools/profiling_analyse/profiling_parse.py @@ -22,14 +22,14 @@ def parse_command(): def show_table(gpu_profiling_info, npu_profiling_info): table = PrettyTable() table.title = '大模型性能拆解' - table.field_names = ['', 'cube算子', 'vector算子', '大kernel算子', '通信', '调度耗时', '调度占比', '内存', + table.field_names = ['', 'cube算子', 'vector算子', '计算流耗时', '通信', '调度耗时', '调度占比', '内存', 'E2E性能值'] table.add_row(['GPU基线', f'{gpu_profiling_info.cube_time:.3f}s', f'{gpu_profiling_info.vector_time:.3f}s', - f'{gpu_profiling_info.large_kernel:.3f}s', f'{gpu_profiling_info.communication_not_overlapped: .3f}s', + f'{gpu_profiling_info.compute_time:.3f}s', f'{gpu_profiling_info.communication_not_overlapped: .3f}s', f'{gpu_profiling_info.scheduling_time:.3f}', f'{gpu_profiling_info.scheduling_ratio:.2%}', f'{gpu_profiling_info.memory_used:.2f}G', f'{gpu_profiling_info.e2e_time:.3f}s']) table.add_row(['当前现状', f'{npu_profiling_info.cube_time:.3f}s', 
f'{npu_profiling_info.vector_time:.3f}s', - f'{npu_profiling_info.large_kernel:.3f}s', f'{npu_profiling_info.communication_not_overlapped: .3f}s', + f'{npu_profiling_info.compute_time:.3f}s', f'{npu_profiling_info.communication_not_overlapped: .3f}s', f'{npu_profiling_info.scheduling_time:.3f}', f'{npu_profiling_info.scheduling_ratio:.2%}', f'{npu_profiling_info.memory_used:.2f}G', f'{npu_profiling_info.e2e_time:.3f}s']) print(table) -- Gitee From c99f2e7f8129daea9a23396a79144d51697c5aee Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 17:10:15 +0800 Subject: [PATCH 2/7] =?UTF-8?q?=E6=9B=B4=E6=96=B0readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/tools/profiling_analyse/README.md | 8 +++++--- debug/tools/profiling_analyse/npu_parser.py | 5 +++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/debug/tools/profiling_analyse/README.md b/debug/tools/profiling_analyse/README.md index d871f0198..5a6fd5e15 100644 --- a/debug/tools/profiling_analyse/README.md +++ b/debug/tools/profiling_analyse/README.md @@ -22,13 +22,15 @@ gpu上的内存使用可以使用nvidia-smi查看,使用json文件分析时需 1、算子耗时profiling数据位于/PROFxxx/device_x/summary路径下的op_summary_x_1.csv文件中。 2、当前仅统计算子运行在vector和cube上的耗时。 3、这2中算子于csv文件中的的TaskType均为AI_CORE,其中aiv_vec_time时间多表明为vector算子,aic_mac_time表明为cube算子。分别累加求和算子耗时进行输出。 +4、算子若无pmu信息,仅统计ai_core的总耗时并显示在结果"vector算子"一列 ### 通信 -此处的通信为通信未掩盖耗时,对应为ASCEND_PROFILER_OUTPUT/trace_view.json下的EVENT_WAIT_SQE,对于多个Stream Id的结果,取Stream Id最小值。 -输出结果为该字段时间求和。 +1、此处的通信为通信未掩盖耗时,对应为ASCEND_PROFILER_OUTPUT/trace_view.json下同一条流的EVENT_WAIT_SQE总耗时。 +2、选取trace_view中的计算流——即流中同时存在EVENT_WAIT_SQE和Task Type为AI_CORE的流 +3、对于AI_CORE存在2条流中的情况,计算流中累加EVENT_WAIT_SQE时会减去同时间区间内另外流产生的AI_CORE耗时 ### 计算流e2e耗时 -此耗时通过统计trace_view.json中时间戳‘ts’的最小值和最大值,其时间差的绝对值即为e2e耗时。 +此耗时通过统计trace_view.json中compute_time时间戳‘ts’的最小值和最大值,其时间差的绝对值即为e2e耗时。 ### 调度占比 1、调度占比的求取需先计算调度耗时,调度占比=调度耗时/e2e耗时 * 100%。 diff --git a/debug/tools/profiling_analyse/npu_parser.py 
b/debug/tools/profiling_analyse/npu_parser.py index 13ab4c9cc..9bf051509 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -28,10 +28,11 @@ class NpuProfilingParser: for dic in data: self.get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, event_wait_sqe_res) if ('name' in dic) and (dic.get('name') == 'compute_time'): - compute_time += dic.get('dur') ts = dic.get('ts') + dur = dic.get('dur') + compute_time += dur min_ts = ts if ts < min_ts else min_ts - max_ts = ts if ts > max_ts else max_ts + max_ts = (ts + dur) if (ts + dur) > max_ts else max_ts # AI_CORE和EVENT_WAIT_SQE共存为计算流 compute_stream = [] parallel_stream = [] -- Gitee From ef94381cc56954821500cfd6682c4c0fe0190f04 Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 17:22:10 +0800 Subject: [PATCH 3/7] =?UTF-8?q?=E4=BF=AE=E6=AD=A3bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/tools/profiling_analyse/npu_parser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py index 9bf051509..9ccd73fff 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -38,7 +38,7 @@ class NpuProfilingParser: parallel_stream = [] # 不存在算子并行的情况 if len(ai_core_dict) == 1: - compute_stream.appen(min(ai_core_dict.keys())) + compute_stream.append(min(ai_core_dict.keys())) elif len(ai_core_dict) == 2: # 2个ai_core,存在并行流(当前最多2条算子计算流) compute_stream = list(event_wait_sqe.keys() & ai_core_dict.keys()) parallel_stream = list(ai_core_dict.keys() - set(compute_stream)) @@ -48,7 +48,7 @@ class NpuProfilingParser: sorted(cs_event_wait_sqe_list, key=lambda x: (x[0])) sorted(cs_ai_core_list, key=lambda x: (x[0])) self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list) - self.profiling.compute_time = compute_time / 10 ** 6 + 
self.profiling_info.compute_time = compute_time / 10 ** 6 self.profiling_info.e2e_time = (max_ts - min_ts) / 1000 / 1000 self.profiling_info.communication_not_overlapped = (event_wait_sqe_res[compute_stream[0]] - self.parallel_time) / 10 ** 6 @@ -108,8 +108,8 @@ class NpuProfilingParser: j = 0 while i < len(cs_event_wait_sqe_list) and j < len(cs_ai_core_list): lo = max(cs_event_wait_sqe_list[i][0], cs_ai_core_list[j][0]) - hi = max(cs_event_wait_sqe_list[i][1], cs_ai_core_list[j][1]) - if lo < hi: + hi = min(cs_event_wait_sqe_list[i][1], cs_ai_core_list[j][1]) + if lo <= hi: ans += (hi - lo) if cs_event_wait_sqe_list[i][1] < cs_ai_core_list[j][1]: i += 1 -- Gitee From 7eeba3f7985ed481b85709e95c09605f44d85a34 Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 17:28:26 +0800 Subject: [PATCH 4/7] bugfix --- debug/tools/profiling_analyse/npu_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py index 9ccd73fff..501bef19a 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -130,5 +130,5 @@ class NpuProfilingParser: if args.get('Task Type') == 'EVENT_WAIT_SQE': res[stream_id] += dur event_wait_sqe[stream_id].append([ts, ts + dur]) - elif: + elif args.get('Task Type') == 'AI_CORE': ai_core_dict[stream_id].append([ts, ts + dur]) \ No newline at end of file -- Gitee From b6e6c5693fd891aee47711d3c9f3af88f1703bb4 Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 20:07:21 +0800 Subject: [PATCH 5/7] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=97=A0pmu=E4=BF=A1?= =?UTF-8?q?=E6=81=AF=E6=97=B6cube=E7=AE=97=E5=AD=90=E7=99=BD=E5=90=8D?= =?UTF-8?q?=E5=8D=95=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/tools/profiling_analyse/README.md | 7 ++++--- debug/tools/profiling_analyse/npu_parser.py | 16 +++++++++++----- 
debug/tools/profiling_analyse/profiling_parse.py | 3 ++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/debug/tools/profiling_analyse/README.md b/debug/tools/profiling_analyse/README.md index 5a6fd5e15..de39ab939 100644 --- a/debug/tools/profiling_analyse/README.md +++ b/debug/tools/profiling_analyse/README.md @@ -22,7 +22,8 @@ gpu上的内存使用可以使用nvidia-smi查看,使用json文件分析时需 1、算子耗时profiling数据位于/PROFxxx/device_x/summary路径下的op_summary_x_1.csv文件中。 2、当前仅统计算子运行在vector和cube上的耗时。 3、这2中算子于csv文件中的的TaskType均为AI_CORE,其中aiv_vec_time时间多表明为vector算子,aic_mac_time表明为cube算子。分别累加求和算子耗时进行输出。 -4、算子若无pmu信息,仅统计ai_core的总耗时并显示在结果"vector算子"一列 +4、算子若无pmu信息,会根据cube算子类型进行区分,当前已知的算子类型为['MatMul', 'BatchMatMul'] +5、用户有添加cube算子的要求,可以使用可选入参'-aop'添加算子名称,使用方式见样例。 ### 通信 1、此处的通信为通信未掩盖耗时,对应为ASCEND_PROFILER_OUTPUT/trace_view.json下同一条流的EVENT_WAIT_SQE总耗时。 @@ -44,5 +45,5 @@ gpu上的内存使用可以使用nvidia-smi查看,使用json文件分析时需 ## 样例 - step1:获取gpu和npu的profiling数据,若采集profiling数据时没开启memory采集开关,则没有内存使用数据 -- 运行命令:python profiling_parse.py -g gpu\gpu_trace_device0.json -glt 0.9 -n npu\xxx_ascend_pt -nlt 1.2 -- 输出结果:可以得到gpu与npu对照的打屏性能拆解数据 +- 运行命令:python profiling_parse.py -g gpu\gpu_trace_device0.json -glt 0.9 -n npu\xxx_ascend_pt -nlt 1.2 -aop op1 op2 +- 输出结果:可以得到gpu与npu对照的打屏性能拆解数据,其中-nlt为输入打屏时间,-aop为手动添加的cube算子类型 diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py index 501bef19a..ede8e8763 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -5,7 +5,7 @@ import parser_helper class NpuProfilingParser: - def __init__(self, npu_step_time, npu_file_path): + def __init__(self, npu_step_time, add_cube_time, npu_file_path): self.npu_json_file = npu_file_path.get('trace_view') self.npu_summary_file = npu_file_path.get('op_summary') self.npu_mem_file = npu_file_path.get('memory_record') @@ -13,6 +13,8 @@ class NpuProfilingParser: self.npu_step_time = npu_step_time self.parallel_time = 0 self.aicore_time = 0 + 
self.cube_op_type = ['MatMul', 'BatchMatMul'] + self.cube_op_type = list(set(self.cube_op_type + add_cube_time)) def parse_npu_json_events(self): if not self.npu_json_file: @@ -70,7 +72,8 @@ class NpuProfilingParser: ai_core_time = 0.0 vec_mac_flag = True # True标记当前summary文件中存在pmu信息 if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None: - print('当前的profiling结果可能是极简模式,无法区分cube和vector,总的ai_core耗时会展示在vector算子列') + print('当前的profiling结果可能是极简模式,通过cube算子白名单进行区分,白名单如下:') + print(cube_op_type) vec_mac_flag = False for i in range(len(info['Model ID'])): task_type = info.loc[i, 'Task Type'] @@ -78,7 +81,9 @@ class NpuProfilingParser: continue task_durations = info.loc[i, 'Task Duration(us)'] ai_core_time += task_durations - if not vec_mac_flag: + op_type = info.loc[i, 'OP Type'] + if not vec_mac_flag: # 如果是极简模式根据OP_Type计算完cube time后提前返回 + cube_time += task_durations if op_type in self.cube_op_type else 0.0 continue aiv_vec_time = info.loc[i, 'aiv_vec_time(us)'] if aiv_vec_time > 0: @@ -88,7 +93,8 @@ class NpuProfilingParser: cube_time = (ai_core_time - vec_time) / 10 ** 6 vec_time /= 10 ** 6 else: - vec_time = ai_core_time / 10 ** 6 + vec_time = (ai_core_time - cube_time) / 10 ** 6 + cube_time /= 10 ** 6 self.profiling_info.cube_time = cube_time self.profiling_info.vector_time = vec_time if not self.npu_mem_file: @@ -131,4 +137,4 @@ class NpuProfilingParser: res[stream_id] += dur event_wait_sqe[stream_id].append([ts, ts + dur]) elif args.get('Task Type') == 'AI_CORE': - ai_core_dict[stream_id].append([ts, ts + dur]) \ No newline at end of file + ai_core_dict[stream_id].append([ts, ts + dur]) diff --git a/debug/tools/profiling_analyse/profiling_parse.py b/debug/tools/profiling_analyse/profiling_parse.py index b5dc9c385..2f7734953 100644 --- a/debug/tools/profiling_analyse/profiling_parse.py +++ b/debug/tools/profiling_analyse/profiling_parse.py @@ -16,6 +16,7 @@ def parse_command(): help='Npu single core profiling root path.') 
parser.add_argument('-nlt', '--npu_log_time', required=False, default=0.0, metavar='(FILE)', type=float, help='Npu one step time(s).') + parser.add_argument('-aop', '--add_cube_op', required=False, default=[], nargs='*', help='add cube op name') return parser.parse_args() @@ -47,7 +48,7 @@ def parse_gpu(args): def parse_npu(args, npu_path): - npu_parser = NpuProfilingParser(args.npu_log_time, npu_path) + npu_parser = NpuProfilingParser(args.npu_log_time, args.add_cube_op, npu_path) npu_parser.parse_npu_csv_events() npu_parser.parse_npu_json_events() return npu_parser.profiling_info -- Gitee From 95b2e5e76e7d36d6b4ae5f3e22e39f94bad741d3 Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 20:12:14 +0800 Subject: [PATCH 6/7] bugfix --- debug/tools/profiling_analyse/npu_parser.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py index ede8e8763..54b2b506d 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -5,7 +5,7 @@ import parser_helper class NpuProfilingParser: - def __init__(self, npu_step_time, add_cube_time, npu_file_path): + def __init__(self, npu_step_time, add_cube_name, npu_file_path): self.npu_json_file = npu_file_path.get('trace_view') self.npu_summary_file = npu_file_path.get('op_summary') self.npu_mem_file = npu_file_path.get('memory_record') @@ -14,7 +14,7 @@ class NpuProfilingParser: self.parallel_time = 0 self.aicore_time = 0 self.cube_op_type = ['MatMul', 'BatchMatMul'] - self.cube_op_type = list(set(self.cube_op_type + add_cube_time)) + self.cube_op_type = list(set(self.cube_op_type + add_cube_name)) def parse_npu_json_events(self): if not self.npu_json_file: @@ -73,7 +73,7 @@ class NpuProfilingParser: vec_mac_flag = True # True标记当前summary文件中存在pmu信息 if info.get('aic_mac_time(us)') is None or info.get('aiv_vec_time(us)') is None: 
print('当前的profiling结果可能是极简模式,通过cube算子白名单进行区分,白名单如下:') - print(cube_op_type) + print(self.cube_op_type) vec_mac_flag = False for i in range(len(info['Model ID'])): task_type = info.loc[i, 'Task Type'] @@ -123,7 +123,6 @@ class NpuProfilingParser: j += 1 return ans - @staticmethod def get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, res): if not dic.get('args'): -- Gitee From 2e0e76969966cf5be698532ebcc943644d5c8a99 Mon Sep 17 00:00:00 2001 From: menff Date: Mon, 10 Jul 2023 21:03:39 +0800 Subject: [PATCH 7/7] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E4=BF=9D=E6=8A=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/tools/profiling_analyse/npu_parser.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py index 54b2b506d..bd8d8350f 100644 --- a/debug/tools/profiling_analyse/npu_parser.py +++ b/debug/tools/profiling_analyse/npu_parser.py @@ -23,6 +23,7 @@ class NpuProfilingParser: compute_time = 0 min_ts = sys.float_info.max max_ts = sys.float_info.min + ts_flag = False # 表明没有获取到compute time的耗时 data = parser_helper.read_json_file(self.npu_json_file) event_wait_sqe = defaultdict(list) ai_core_dict = defaultdict(list) @@ -30,6 +31,7 @@ class NpuProfilingParser: for dic in data: self.get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, event_wait_sqe_res) if ('name' in dic) and (dic.get('name') == 'compute_time'): + ts_flag = True ts = dic.get('ts') dur = dic.get('dur') compute_time += dur @@ -51,7 +53,7 @@ class NpuProfilingParser: sorted(cs_event_wait_sqe_list, key=lambda x: (x[0])) sorted(cs_ai_core_list, key=lambda x: (x[0])) self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list) self.profiling_info.compute_time = compute_time / 10 ** 6 - self.profiling_info.e2e_time = (max_ts - min_ts) / 1000 / 1000 + self.profiling_info.e2e_time = (max_ts - min_ts) / 10 ** 6 if ts_flag else 0
self.profiling_info.communication_not_overlapped = (event_wait_sqe_res[compute_stream[0]] - self.parallel_time) / 10 ** 6 time_required = (self.profiling_info.cube_time + self.profiling_info.vector_time) + \ @@ -60,7 +62,8 @@ class NpuProfilingParser: self.profiling_info.scheduling_time = self.npu_step_time - time_required else: self.profiling_info.scheduling_time = self.profiling_info.e2e_time - time_required - self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time + self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time \ + if self.profiling_info.e2e_time != 0 else 0 def parse_npu_csv_events(self): if not self.npu_summary_file: -- Gitee