From 09caae86879515ebe784c6dd6e4458da655297e4 Mon Sep 17 00:00:00 2001
From: menff
Date: Thu, 6 Jul 2023 17:12:17 +0800
Subject: [PATCH] Update the calculation of non-overlapped communication time
 and add the printed per-step time as an input
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 debug/tools/profiling_analyse/README.md      |  6 ++---
 debug/tools/profiling_analyse/npu_parser.py  | 23 +++++++++++--------
 .../profiling_analyse/profiling_parse.py     | 10 ++++----
 3 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/debug/tools/profiling_analyse/README.md b/debug/tools/profiling_analyse/README.md
index 4d2e62d2a96..8f809597d69 100644
--- a/debug/tools/profiling_analyse/README.md
+++ b/debug/tools/profiling_analyse/README.md
@@ -24,14 +24,14 @@ GPU memory usage can be viewed with nvidia-smi; analysis via the json file requires
 ## NPU performance data parsing
 ### Operator time
 1. The operator time profiling data is in the op_summary_x_1.csv file under the /PROFxxx/device_x/summary path.
-2. Currently only the time operators spend running on vector and cube is counted.、
+2. Currently only the time operators spend running on vector and cube is counted.
 3. Both of these operator types have TaskType AI_CORE in the csv file; a larger aiv_vec_time indicates a vector operator, while aic_mac_time indicates a cube operator. The operator times are accumulated separately and output.
 
 ### Large-kernel operators
 The list of large-kernel operators is to be added.
 
 ### Communication
-Communication here refers to the non-overlapped communication time, which corresponds to communication_not_overlapped in ASCEND_PROFILER_OUTPUT/trace_view.json.
+Communication here refers to the non-overlapped communication time, which corresponds to EVENT_WAIT_SQE in ASCEND_PROFILER_OUTPUT/trace_view.json; when results exist for multiple Stream Ids, the smallest Stream Id is taken.
 The output is the sum of the durations of this field.
 
 ### Compute-stream e2e time
@@ -40,7 +40,7 @@
 ### Scheduling ratio
 1. To obtain the scheduling ratio, the scheduling time must be computed first; scheduling ratio = scheduling time / e2e time * 100%.
 2. There are two ways to compute the scheduling time: (1) scheduling time = printed per-step time - operator time - non-overlapped communication time; (2) scheduling time = e2e time - total execution time of compute-stream tasks.
-3. Since the "printed per-step time" requires extra recorded input, method (1) is not used for now; the total execution time of compute-stream tasks in method (2) is compute_time in trace_view.json.
+3. Since the "printed per-step time" requires extra recorded input, an optional input field "-ns" is added so the user can supply the "printed per-step time"; if it is not provided, the e2e time is used instead.
 
 ### Memory
 1. The memory statistics come from "Total Reserved(MB)" in ASCEND_PROFILER_OUTPUT/memory_record.csv.
diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py
index 7e5949812ac..417798cc5a5 100644
--- a/debug/tools/profiling_analyse/npu_parser.py
+++ b/debug/tools/profiling_analyse/npu_parser.py
@@ -4,31 +4,34 @@ import parser_helper
 
 
 class NpuProfilingParser:
-    def __init__(self, npu_file_path):
+    def __init__(self, npu_step_time, npu_file_path):
         self.npu_json_file = npu_file_path.get('trace_view')
         self.npu_summary_file = npu_file_path.get('op_summary')
         self.npu_mem_file = npu_file_path.get('memory_record')
         self.profiling_info = parser_helper.ProfilingInfo()
+        self.npu_step_time = npu_step_time
 
     def parse_npu_json_events(self):
-        conn_time = 0.0
-        compute_time = 0.0
+        event_wait_sqe = {}
         min_ts = sys.float_info.max
         max_ts = sys.float_info.min
         data = parser_helper.read_json_file(self.npu_json_file)
         for dic in data:
-            if dic.get('name') == 'communication_not_overlapped':
-                conn_time += float(dic.get('dur'))
-            if dic.get('name') == 'compute_time':
-                compute_time += float(dic.get('dur'))
+            if dic.get('name') == 'EVENT_WAIT_SQE':
+                args = dic.get('args')
+                stream_id = args.get('Stream Id')
+                event_wait_sqe[stream_id] = (event_wait_sqe[stream_id] + dic.get('dur')) if \
+                    event_wait_sqe.get(stream_id) else dic.get('dur')
             if dic.get('ts'):
                 ts = dic.get('ts')
                 min_ts = ts if ts < min_ts else min_ts
                 max_ts = ts if ts > max_ts else max_ts
         self.profiling_info.e2e_time = (max_ts - min_ts) / 10 ** 6
-        self.profiling_info.communication_not_overlapped = conn_time / 10 ** 6
-        compute_time = compute_time / 10 ** 6
-        self.profiling_info.scheduling_time = self.profiling_info.e2e_time - compute_time
+        self.profiling_info.communication_not_overlapped = event_wait_sqe.get(min(event_wait_sqe)) / 10 ** 6
+        time_required = (self.profiling_info.cube_time + self.profiling_info.vector_time) + \
+            self.profiling_info.communication_not_overlapped
+        self.profiling_info.scheduling_time = (self.npu_step_time - time_required) if self.npu_step_time \
+            else (self.profiling_info.e2e_time - time_required)
         self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time
 
     def parse_npu_csv_events(self):
diff --git a/debug/tools/profiling_analyse/profiling_parse.py b/debug/tools/profiling_analyse/profiling_parse.py
index 0d2dde05a0e..5d1c76ac6cc 100644
--- a/debug/tools/profiling_analyse/profiling_parse.py
+++ b/debug/tools/profiling_analyse/profiling_parse.py
@@ -13,6 +13,8 @@ def parse_command():
     parser.add_argument('-g', '--gpu', required=False, default='', metavar='(FILE)', help='Gpu profiling json file.')
     parser.add_argument('-n', '--npu', required=False, default='', metavar='(FILE)',
                         help='Npu single core profiling root path.')
+    parser.add_argument('-ns', '--npu_step', required=False, default='', metavar='(FILE)',
+                        help='Npu one step time(s).')
     return parser.parse_args()
 
 
@@ -41,7 +43,7 @@ def parse_gpu(args):
     return ProfilingInfo()
 
 
-def parse_npu(npu_path):
+def parse_npu(args, npu_path):
     if not npu_path.get('trace_view'):
         print('Npu trace json file is not available.')
         return ProfilingInfo()
@@ -51,9 +53,9 @@
     if not npu_path.get('memory_record'):
         print('Npu op memory csv file is not available.')
         return ProfilingInfo()
-    npu_parser = NpuProfilingParser(npu_path)
-    npu_parser.parse_npu_json_events()
+    npu_parser = NpuProfilingParser(args.npu_step, npu_path)
     npu_parser.parse_npu_csv_events()
+    npu_parser.parse_npu_json_events()
    return npu_parser.profiling_info
 
 
@@ -68,7 +70,7 @@ def main():
                 npu_path['memory_record'] = os.path.join(root, file)
             if 'op_summary' in file:
                 npu_path['op_summary'] = os.path.join(root, file)
-    show_table(parse_gpu(args), parse_npu(npu_path))
+    show_table(parse_gpu(args), parse_npu(args, npu_path))
 
 
 if __name__ == '__main__':
-- 
Gitee
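For readers who want to check the new formulas outside the tool, below is a minimal, standalone sketch of the calculation this patch introduces: sum the EVENT_WAIT_SQE durations per Stream Id, take the stream with the smallest Stream Id as the non-overlapped communication time, and derive the scheduling time from the optional printed per-step time ("-ns") or, failing that, from the e2e time. The event fields (name, dur, ts, args['Stream Id']) are taken from the diff; the guard for a trace without EVENT_WAIT_SQE events and the explicit float() conversion of the step time are defensive assumptions of this sketch, not part of the patch.

```python
import json
import sys


def parse_trace_view(trace_view_path):
    """Aggregate EVENT_WAIT_SQE time per Stream Id and the e2e span from trace_view.json."""
    with open(trace_view_path) as f:
        events = json.load(f)

    event_wait_sqe = {}  # Stream Id -> summed EVENT_WAIT_SQE duration (us)
    min_ts, max_ts = sys.float_info.max, sys.float_info.min
    for event in events:
        if event.get('name') == 'EVENT_WAIT_SQE':
            stream_id = event.get('args', {}).get('Stream Id')
            event_wait_sqe[stream_id] = event_wait_sqe.get(stream_id, 0.0) + float(event.get('dur', 0.0))
        if event.get('ts'):
            ts = event.get('ts')
            min_ts = min(min_ts, ts)
            max_ts = max(max_ts, ts)

    e2e_time = (max_ts - min_ts) / 10 ** 6  # seconds
    # Non-overlapped communication time: the stream with the smallest Stream Id;
    # the empty-dict guard is an addition of this sketch, not in the patch.
    communication_not_overlapped = (
        event_wait_sqe[min(event_wait_sqe)] / 10 ** 6 if event_wait_sqe else 0.0
    )
    return e2e_time, communication_not_overlapped


def scheduling_metrics(e2e_time, communication_not_overlapped, cube_time, vector_time, step_time=''):
    """Scheduling time (s) and scheduling ratio; step_time is the optional '-ns' value."""
    time_required = cube_time + vector_time + communication_not_overlapped
    # argparse hands '-ns' over as a string, so the float() conversion is an assumption of this sketch.
    base = float(step_time) if step_time else e2e_time
    scheduling_time = base - time_required
    return scheduling_time, scheduling_time / e2e_time
```

With the patch applied, the printed per-step time can be passed on the command line, for example `python profiling_parse.py -n <PROF root path> -ns <printed step time in seconds>` (the path and time values are placeholders); when "-ns" is omitted, the e2e time stands in for it, as the updated README states.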
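The cube_time and vector_time that feed time_required are produced by parse_npu_csv_events(), whose body is not part of this diff; the patch only moves its call ahead of the json pass, presumably so those fields are already filled in when the scheduling time is derived. As a rough, hypothetical illustration of the splitting rule the README describes (TaskType of AI_CORE, with a larger aiv_vec_time marking a vector operator), here is a sketch; the exact csv header names and the choice of which duration column to accumulate are assumptions that should be checked against op_summary_x_1.csv.

```python
import csv


def sum_op_times(op_summary_csv):
    """Rough sketch: split AI_CORE operator time into vector and cube buckets."""
    vector_time_us = 0.0
    cube_time_us = 0.0
    with open(op_summary_csv, newline='') as f:
        for row in csv.DictReader(f):
            # Header names follow the README ('TaskType', 'aiv_vec_time', 'aic_mac_time');
            # verify them against the actual csv, they may differ or carry unit suffixes.
            if row.get('TaskType') != 'AI_CORE':
                continue
            aiv_vec_time = float(row.get('aiv_vec_time') or 0)
            aic_mac_time = float(row.get('aic_mac_time') or 0)
            if aiv_vec_time > aic_mac_time:  # larger aiv_vec_time -> vector operator
                vector_time_us += aiv_vec_time
            else:                            # otherwise treat it as a cube operator
                cube_time_us += aic_mac_time
    return vector_time_us / 10 ** 6, cube_time_us / 10 ** 6  # seconds
```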