From c5c52fec3ad77d04a643217e49dd1b4d06f78b08 Mon Sep 17 00:00:00 2001 From: zhouyanming Date: Mon, 10 Jul 2023 21:36:24 +0800 Subject: [PATCH 1/6] fix e2e time --- debug/tools/profiling_analyse/gpu_parser.py | 32 +++++++++++++++------ 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/debug/tools/profiling_analyse/gpu_parser.py b/debug/tools/profiling_analyse/gpu_parser.py index ffcee87ed..f1598ac68 100644 --- a/debug/tools/profiling_analyse/gpu_parser.py +++ b/debug/tools/profiling_analyse/gpu_parser.py @@ -1,4 +1,4 @@ -from collections import defaultdict +from collections import Counter, defaultdict import pandas as pd import parser_helper @@ -7,6 +7,7 @@ import parser_helper class GpuProfilingParser: def __init__(self, args): self.trace_events = self.read_profiling_json_file(args.gpu) + self.compute_stream_id = self.infer_compute_stream_id() self.one_step_time = args.gpu_log_time self.profiling_info = parser_helper.ProfilingInfo() @@ -22,12 +23,12 @@ class GpuProfilingParser: all_op_time = 0.0 op_list = [] compute_stream_dur = 0.0 # 计算流耗时 - marks = defaultdict(int) # mark for compute communication_not_overlapped time + marks = defaultdict(int) # mark for compute communication_not_overlapped time for event in self.trace_events: if not isinstance(event, dict): continue - if event.get('args') and event.get('args').get('stream') == 7: + if event.get('args') and event.get('args').get('stream') == self.compute_stream_id: compute_stream_dur += float(event.get('dur')) if not {'name', 'cat', 'dur', 'ts'} < event.keys(): continue @@ -39,18 +40,19 @@ class GpuProfilingParser: continue if 'nccl' in name: for timestep in range(ts + 1, ts + dur + 1): - marks[str(timestep)] += 1 # mark this timestep in communication stream + marks[str(timestep)] += 1 # mark this timestep in communication stream continue else: for timestep in range(ts + 1, ts + dur + 1): - marks[str(timestep)] += -100 # mark this timestep in compute stream + marks[str(timestep)] += -100 # mark this timestep in compute stream if 'gemm' in name: cube_time += float(dur) all_op_time += float(dur) op_list.append([ts, name, cat, dur]) op_dataframe = pd.DataFrame(op_list, columns=['time start', 'name', 'cat', 'dur']) op_dataframe.to_csv('gpu_perf.csv', index=False) - self.profiling_info.communication_not_overlapped = len([_ for _,value in marks.items() if value > 0]) / 10 ** 6 + self.profiling_info.compute_time = compute_stream_dur / 10 ** 6 + self.profiling_info.communication_not_overlapped = len([_ for _, value in marks.items() if value > 0]) / 10 ** 6 self.profiling_info.cube_time = cube_time / 10 ** 6 self.profiling_info.vector_time = (all_op_time - cube_time) / 10 ** 6 self.parse_e2e_time() @@ -64,8 +66,13 @@ class GpuProfilingParser: self.parse_memory_reserved() def parse_e2e_time(self): - timeline = sorted(self.trace_events, key=lambda event: event.get('ts')) - self.profiling_info.e2e_time = (timeline[-1].get('ts') - timeline[0].get('ts')) / 10 ** 6 + # timeline = sorted(self.trace_events, key=lambda event: event.get('ts')) + # self.profiling_info.e2e_time = (timeline[-1].get('ts') - timeline[0].get('ts')) / 10 ** 6 + compute_events_timeline = [event for event in self.trace_events if + event.get('args') and event.get('args').get('stream') == self.compute_stream_id] + compute_events_timeline = sorted(compute_events_timeline, key=lambda event: event.get('ts')) + self.profiling_info.e2e_time = (compute_events_timeline[-1].get('ts') + compute_events_timeline[-1].get('dur') - + compute_events_timeline[0].get('ts')) / 10 ** 6 def parse_memory_reserved(self): memories = [event.get('args').get('Total Reserved') for event in self.trace_events if @@ -74,3 +81,12 @@ class GpuProfilingParser: print("Gpu profiling data doesn't contain memory info") return self.profiling_info.memory_used = max(memories) / 1024 ** 3 + + def infer_compute_stream_id(self): + kernel_stream_ids = [event.get('args').get('stream') for event in self.trace_events if + event.get('cat') == 'Kernel' and 'nccl' not in event.get('name') + and event.get('args') and event.get('args').get('stream')] + if not kernel_stream_ids: + raise RuntimeError('The profiling data does nor contain kernel running data.') + counter = Counter(kernel_stream_ids) + return counter.most_common(1)[0][0] \ No newline at end of file -- Gitee From e930309ff03586ab8983aa2a61a421422bad7bc0 Mon Sep 17 00:00:00 2001 From: zhouyanming Date: Tue, 11 Jul 2023 11:05:28 +0800 Subject: [PATCH 2/6] fix e2e time --- debug/tools/profiling_analyse/gpu_parser.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/debug/tools/profiling_analyse/gpu_parser.py b/debug/tools/profiling_analyse/gpu_parser.py index f1598ac68..cc206c84f 100644 --- a/debug/tools/profiling_analyse/gpu_parser.py +++ b/debug/tools/profiling_analyse/gpu_parser.py @@ -66,8 +66,6 @@ class GpuProfilingParser: self.parse_memory_reserved() def parse_e2e_time(self): - # timeline = sorted(self.trace_events, key=lambda event: event.get('ts')) - # self.profiling_info.e2e_time = (timeline[-1].get('ts') - timeline[0].get('ts')) / 10 ** 6 compute_events_timeline = [event for event in self.trace_events if event.get('args') and event.get('args').get('stream') == self.compute_stream_id] compute_events_timeline = sorted(compute_events_timeline, key=lambda event: event.get('ts')) @@ -87,6 +85,6 @@ class GpuProfilingParser: event.get('cat') == 'Kernel' and 'nccl' not in event.get('name') and event.get('args') and event.get('args').get('stream')] if not kernel_stream_ids: - raise RuntimeError('The profiling data does nor contain kernel running data.') + raise RuntimeError('The profiling data does not contain kernel running data.') counter = Counter(kernel_stream_ids) - return counter.most_common(1)[0][0] \ No newline at end of file + return counter.most_common(1)[0][0] -- Gitee From 6473ae429c1ecd1c4943029d880d6a4873b8de4b Mon Sep 17 00:00:00 2001 From: zhouyanming Date: Tue, 11 Jul 2023 15:58:13 +0800 Subject: [PATCH 3/6] fix e2e time --- debug/tools/profiling_analyse/gpu_parser.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/debug/tools/profiling_analyse/gpu_parser.py b/debug/tools/profiling_analyse/gpu_parser.py index cc206c84f..aef2b7fe5 100644 --- a/debug/tools/profiling_analyse/gpu_parser.py +++ b/debug/tools/profiling_analyse/gpu_parser.py @@ -81,9 +81,12 @@ class GpuProfilingParser: self.profiling_info.memory_used = max(memories) / 1024 ** 3 def infer_compute_stream_id(self): - kernel_stream_ids = [event.get('args').get('stream') for event in self.trace_events if - event.get('cat') == 'Kernel' and 'nccl' not in event.get('name') - and event.get('args') and event.get('args').get('stream')] + kernel_stream_ids = [] + for event in self.trace_events: + is_kernel_exec_event = event.get('cat') == 'Kernel' and 'nccl' not in event.get('name') + has_stream_id_event = event.get('args') and event.get('args').get('stream') + if is_kernel_exec_event and has_stream_id_event: + kernel_stream_ids.append(event) if not kernel_stream_ids: raise RuntimeError('The profiling data does not contain kernel running data.') counter = Counter(kernel_stream_ids) -- Gitee From 2c8f92d32b28962022495dc07510b7c508ff22d0 Mon Sep 17 00:00:00 2001 From: zhouyanming Date: Tue, 11 Jul 2023 16:03:10 +0800 Subject: [PATCH 4/6] fix e2e time --- debug/tools/profiling_analyse/gpu_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/tools/profiling_analyse/gpu_parser.py b/debug/tools/profiling_analyse/gpu_parser.py index aef2b7fe5..af4a72ade 100644 --- a/debug/tools/profiling_analyse/gpu_parser.py +++ b/debug/tools/profiling_analyse/gpu_parser.py @@ -86,7 +86,7 @@ class GpuProfilingParser: is_kernel_exec_event = event.get('cat') == 'Kernel' and 'nccl' not in event.get('name') has_stream_id_event = event.get('args') and event.get('args').get('stream') if is_kernel_exec_event and has_stream_id_event: - kernel_stream_ids.append(event) + kernel_stream_ids.append(event.get('args').get('stream')) if not kernel_stream_ids: raise RuntimeError('The profiling data does not contain kernel running data.') counter = Counter(kernel_stream_ids) -- Gitee From 7c0cad9d5249b78e5f870a44e5b18468f5fea399 Mon Sep 17 00:00:00 2001 From: zhouyanming Date: Tue, 11 Jul 2023 16:11:34 +0800 Subject: [PATCH 5/6] fix e2e time --- debug/tools/profiling_analyse/gpu_parser.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/debug/tools/profiling_analyse/gpu_parser.py b/debug/tools/profiling_analyse/gpu_parser.py index af4a72ade..977d7a1d7 100644 --- a/debug/tools/profiling_analyse/gpu_parser.py +++ b/debug/tools/profiling_analyse/gpu_parser.py @@ -73,8 +73,10 @@ class GpuProfilingParser: compute_events_timeline[0].get('ts')) / 10 ** 6 def parse_memory_reserved(self): - memories = [event.get('args').get('Total Reserved') for event in self.trace_events if - event.get('name') == '[memory]' and event.get('args').get('Device Id') >= 0] + memories = [ + event.get('args').get('Total Reserved') for event in self.trace_events + if event.get('name') == '[memory]' and event.get('args').get('Device Id') >= 0 + ] if not memories: print("Gpu profiling data doesn't contain memory info") return -- Gitee From 1f2e87682512a450f584f00942609363bf79cbc3 Mon Sep 17 00:00:00 2001 From: zhouyanming Date: Tue, 11 Jul 2023 19:26:45 +0800 Subject: [PATCH 6/6] fix e2e time --- debug/tools/profiling_analyse/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/debug/tools/profiling_analyse/README.md b/debug/tools/profiling_analyse/README.md index de39ab939..fea5ff779 100644 --- a/debug/tools/profiling_analyse/README.md +++ b/debug/tools/profiling_analyse/README.md @@ -5,10 +5,12 @@ 1. 获取json文件中的traceEvents字段,获取所有cat为kernel的,且name中不包含nccl的event,将他们的耗时相加即为所有算子耗时all_time; 2. 取算子中name包含gemm的为cube算子耗时cube_time. 3. vector算子耗时即为(all_time - cube_time) +### 计算流耗时 +获取计算流所有event的耗时总和 ### 通信 此处的通信指通信未掩盖耗时,通过计算有通信流而无计算流的时间戳获得 ### 计算流e2e耗时 -按照时间也就是'ts'字段排序所有events,可以看到最后的event是Record Window End,故使用最后一个event的ts值减去第一个event的ts值作为计算流e2e耗时 +取计算流的所有event,用最后一个event的ts加上耗时dur减去第一个event的ts作为计算流的e2e耗时 ### 调度 gpu上的调度耗时计算方法采用:调度耗时 = 单步打屏时间 - 算子耗时 - 通信不可掩盖耗时 单步打屏时间需要用户输入,当用户不输入时,采用e2e耗时代替单步打屏时间 -- Gitee