From 09caae86879515ebe784c6dd6e4458da655297e4 Mon Sep 17 00:00:00 2001
From: menff
Date: Thu, 6 Jul 2023 17:12:17 +0800
Subject: [PATCH] Update the calculation of non-overlapped communication time
 and add the printed per-step time as an input
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 debug/tools/profiling_analyse/README.md      |  6 ++---
 debug/tools/profiling_analyse/npu_parser.py  | 23 +++++++++++--------
 .../profiling_analyse/profiling_parse.py     | 10 ++++----
 3 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/debug/tools/profiling_analyse/README.md b/debug/tools/profiling_analyse/README.md
index 4d2e62d2a96..8f809597d69 100644
--- a/debug/tools/profiling_analyse/README.md
+++ b/debug/tools/profiling_analyse/README.md
@@ -24,14 +24,14 @@ GPU memory usage can be viewed with nvidia-smi; analysis via the json file requires
 ## NPU performance data parsing
 ### Operator time
 1. The operator time profiling data is in the op_summary_x_1.csv file under the /PROFxxx/device_x/summary path.
-2. Currently only the time operators spend running on vector and cube is counted.、
+2. Currently only the time operators spend running on vector and cube is counted.
 3. Both of these operator types have TaskType AI_CORE in the csv file; a larger aiv_vec_time indicates a vector operator, while aic_mac_time indicates a cube operator. The operator times are accumulated separately and output.
 
 ### Large-kernel operators
 The list of large-kernel operators is to be added.
 
 ### Communication
-Communication here refers to the non-overlapped communication time, which corresponds to communication_not_overlapped in ASCEND_PROFILER_OUTPUT/trace_view.json.
+Communication here refers to the non-overlapped communication time, which corresponds to EVENT_WAIT_SQE in ASCEND_PROFILER_OUTPUT/trace_view.json; when results exist for multiple Stream Ids, the smallest Stream Id is taken.
 The output is the sum of the durations of this field.
 
 ### Compute-stream e2e time
@@ -40,7 +40,7 @@
 ### Scheduling ratio
 1. To obtain the scheduling ratio, the scheduling time must be computed first; scheduling ratio = scheduling time / e2e time * 100%.
 2. There are two ways to compute the scheduling time: (1) scheduling time = printed per-step time - operator time - non-overlapped communication time; (2) scheduling time = e2e time - total execution time of compute-stream tasks.
-3. Since the "printed per-step time" requires extra recorded input, method (1) is not used for now; the total execution time of compute-stream tasks in method (2) is compute_time in trace_view.json.
+3. Since the "printed per-step time" requires extra recorded input, an optional input field "-ns" is added so the user can supply the "printed per-step time"; if it is not provided, the e2e time is used instead.
 
 ### Memory
 1. The memory statistics come from "Total Reserved(MB)" in ASCEND_PROFILER_OUTPUT/memory_record.csv.
diff --git a/debug/tools/profiling_analyse/npu_parser.py b/debug/tools/profiling_analyse/npu_parser.py
index 7e5949812ac..417798cc5a5 100644
--- a/debug/tools/profiling_analyse/npu_parser.py
+++ b/debug/tools/profiling_analyse/npu_parser.py
@@ -4,31 +4,34 @@ import parser_helper
 
 
 class NpuProfilingParser:
-    def __init__(self, npu_file_path):
+    def __init__(self, npu_step_time, npu_file_path):
         self.npu_json_file = npu_file_path.get('trace_view')
         self.npu_summary_file = npu_file_path.get('op_summary')
         self.npu_mem_file = npu_file_path.get('memory_record')
         self.profiling_info = parser_helper.ProfilingInfo()
+        self.npu_step_time = npu_step_time
 
     def parse_npu_json_events(self):
-        conn_time = 0.0
-        compute_time = 0.0
+        event_wait_sqe = {}
         min_ts = sys.float_info.max
         max_ts = sys.float_info.min
         data = parser_helper.read_json_file(self.npu_json_file)
         for dic in data:
-            if dic.get('name') == 'communication_not_overlapped':
-                conn_time += float(dic.get('dur'))
-            if dic.get('name') == 'compute_time':
-                compute_time += float(dic.get('dur'))
+            if dic.get('name') == 'EVENT_WAIT_SQE':
+                args = dic.get('args')
+                stream_id = args.get('Stream Id')
+                event_wait_sqe[stream_id] = (event_wait_sqe[stream_id] + dic.get('dur')) if \
+                    event_wait_sqe.get(stream_id) else dic.get('dur')
             if dic.get('ts'):
                 ts = dic.get('ts')
                 min_ts = ts if ts < min_ts else min_ts
                 max_ts = ts if ts > max_ts else max_ts
         self.profiling_info.e2e_time = (max_ts - min_ts) / 10 ** 6
-        self.profiling_info.communication_not_overlapped = conn_time / 10 ** 6
-        compute_time = compute_time / 10 ** 6
-        self.profiling_info.scheduling_time = self.profiling_info.e2e_time - compute_time
+        self.profiling_info.communication_not_overlapped = event_wait_sqe.get(min(event_wait_sqe)) / 10 ** 6
+        time_required = (self.profiling_info.cube_time + self.profiling_info.vector_time) + \
+            self.profiling_info.communication_not_overlapped
+        self.profiling_info.scheduling_time = (self.npu_step_time - time_required) if self.npu_step_time \
+            else (self.profiling_info.e2e_time - time_required)
         self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time
 
     def parse_npu_csv_events(self):
diff --git a/debug/tools/profiling_analyse/profiling_parse.py b/debug/tools/profiling_analyse/profiling_parse.py
index 0d2dde05a0e..5d1c76ac6cc 100644
--- a/debug/tools/profiling_analyse/profiling_parse.py
+++ b/debug/tools/profiling_analyse/profiling_parse.py
@@ -13,6 +13,8 @@ def parse_command():
     parser.add_argument('-g', '--gpu', required=False, default='', metavar='(FILE)', help='Gpu profiling json file.')
     parser.add_argument('-n', '--npu', required=False, default='', metavar='(FILE)',
                         help='Npu single core profiling root path.')
+    parser.add_argument('-ns', '--npu_step', required=False, default='', metavar='(FILE)',
+                        help='Npu one step time(s).')
     return parser.parse_args()
 
 
@@ -41,7 +43,7 @@ def parse_gpu(args):
     return ProfilingInfo()
 
 
-def parse_npu(npu_path):
+def parse_npu(args, npu_path):
     if not npu_path.get('trace_view'):
         print('Npu trace json file is not available.')
         return ProfilingInfo()
@@ -51,9 +53,9 @@
     if not npu_path.get('memory_record'):
         print('Npu op memory csv file is not available.')
         return ProfilingInfo()
-    npu_parser = NpuProfilingParser(npu_path)
-    npu_parser.parse_npu_json_events()
+    npu_parser = NpuProfilingParser(args.npu_step, npu_path)
     npu_parser.parse_npu_csv_events()
+    npu_parser.parse_npu_json_events()
    return npu_parser.profiling_info
 
 
@@ -68,7 +70,7 @@ def main():
                 npu_path['memory_record'] = os.path.join(root, file)
             if 'op_summary' in file:
                 npu_path['op_summary'] = os.path.join(root, file)
-    show_table(parse_gpu(args), parse_npu(npu_path))
+    show_table(parse_gpu(args), parse_npu(args, npu_path))
 
 
 if __name__ == '__main__':
-- 
Gitee
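For readers who want to check the new formulas outside the tool, below is a minimal, standalone sketch of the calculation this patch introduces: sum the EVENT_WAIT_SQE durations per Stream Id, take the stream with the smallest Stream Id as the non-overlapped communication time, and derive the scheduling time from the optional printed per-step time ("-ns") or, failing that, from the e2e time. The event fields (name, dur, ts, args['Stream Id']) are taken from the diff; the guard for a trace without EVENT_WAIT_SQE events and the explicit float() conversion of the step time are defensive assumptions of this sketch, not part of the patch.

```python
import json
import sys


def parse_trace_view(trace_view_path):
    """Aggregate EVENT_WAIT_SQE time per Stream Id and the e2e span from trace_view.json."""
    with open(trace_view_path) as f:
        events = json.load(f)

    event_wait_sqe = {}  # Stream Id -> summed EVENT_WAIT_SQE duration (us)
    min_ts, max_ts = sys.float_info.max, sys.float_info.min
    for event in events:
        if event.get('name') == 'EVENT_WAIT_SQE':
            stream_id = event.get('args', {}).get('Stream Id')
            event_wait_sqe[stream_id] = event_wait_sqe.get(stream_id, 0.0) + float(event.get('dur', 0.0))
        if event.get('ts'):
            ts = event.get('ts')
            min_ts = min(min_ts, ts)
            max_ts = max(max_ts, ts)

    e2e_time = (max_ts - min_ts) / 10 ** 6  # seconds
    # Non-overlapped communication time: the stream with the smallest Stream Id;
    # the empty-dict guard is an addition of this sketch, not in the patch.
    communication_not_overlapped = (
        event_wait_sqe[min(event_wait_sqe)] / 10 ** 6 if event_wait_sqe else 0.0
    )
    return e2e_time, communication_not_overlapped


def scheduling_metrics(e2e_time, communication_not_overlapped, cube_time, vector_time, step_time=''):
    """Scheduling time (s) and scheduling ratio; step_time is the optional '-ns' value."""
    time_required = cube_time + vector_time + communication_not_overlapped
    # argparse hands '-ns' over as a string, so the float() conversion is an assumption of this sketch.
    base = float(step_time) if step_time else e2e_time
    scheduling_time = base - time_required
    return scheduling_time, scheduling_time / e2e_time
```

With the patch applied, the printed per-step time can be passed on the command line, for example `python profiling_parse.py -n <PROF root path> -ns <printed step time in seconds>` (the path and time values are placeholders); when "-ns" is omitted, the e2e time stands in for it, as the updated README states.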
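The cube_time and vector_time that feed time_required are produced by parse_npu_csv_events(), whose body is not part of this diff; the patch only moves its call ahead of the json pass, presumably so those fields are already filled in when the scheduling time is derived. As a rough, hypothetical illustration of the splitting rule the README describes (TaskType of AI_CORE, with a larger aiv_vec_time marking a vector operator), here is a sketch; the exact csv header names and the choice of which duration column to accumulate are assumptions that should be checked against op_summary_x_1.csv.

```python
import csv


def sum_op_times(op_summary_csv):
    """Rough sketch: split AI_CORE operator time into vector and cube buckets."""
    vector_time_us = 0.0
    cube_time_us = 0.0
    with open(op_summary_csv, newline='') as f:
        for row in csv.DictReader(f):
            # Header names follow the README ('TaskType', 'aiv_vec_time', 'aic_mac_time');
            # verify them against the actual csv, they may differ or carry unit suffixes.
            if row.get('TaskType') != 'AI_CORE':
                continue
            aiv_vec_time = float(row.get('aiv_vec_time') or 0)
            aic_mac_time = float(row.get('aic_mac_time') or 0)
            if aiv_vec_time > aic_mac_time:  # larger aiv_vec_time -> vector operator
                vector_time_us += aiv_vec_time
            else:                            # otherwise treat it as a cube operator
                cube_time_us += aic_mac_time
    return vector_time_us / 10 ** 6, cube_time_us / 10 ** 6  # seconds
```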