diff --git a/debug/tools/profiling_analyse/README.md b/debug/tools/profiling_analyse/README.md
index de39ab939eb075df115734cd9a0b34f7468de5ab..fea5ff779c4b21161ccec6ef6781c4ea3cd0a1af 100644
--- a/debug/tools/profiling_analyse/README.md
+++ b/debug/tools/profiling_analyse/README.md
@@ -5,10 +5,12 @@
 1. Read the traceEvents field of the json file, take every event whose cat is kernel and whose name does not contain nccl, and sum their durations to get the total operator time all_time;
 2. Among these operators, those whose name contains gemm give the cube operator time cube_time.
 3. The vector operator time is then (all_time - cube_time)
+### Compute stream time
+Sum of the durations of all events on the compute stream
 ### Communication
 Communication here means the non-overlapped communication time, obtained by counting the timestamps that fall in the communication stream but not in the compute stream
 ### Compute stream e2e time
-Sort all events by time, i.e. by the 'ts' field; the last event is Record Window End, so the compute stream e2e time was taken as the ts of the last event minus the ts of the first event
+Take all events on the compute stream; the compute stream e2e time is the ts of the last event plus its duration dur, minus the ts of the first event
 ### Scheduling
 On GPU the scheduling time is computed as: scheduling time = per-step time printed in the training log - operator time - non-overlapped communication time
 The printed per-step time has to be supplied by the user; if it is not supplied, the e2e time is used in its place
diff --git a/debug/tools/profiling_analyse/gpu_parser.py b/debug/tools/profiling_analyse/gpu_parser.py
index ffcee87edaaafd050b9720ab314895e4f6851481..977d7a1d79ea0d4db948b039731041b98ba3e18f 100644
--- a/debug/tools/profiling_analyse/gpu_parser.py
+++ b/debug/tools/profiling_analyse/gpu_parser.py
@@ -1,4 +1,4 @@
-from collections import defaultdict
+from collections import Counter, defaultdict
 import pandas as pd
 
 import parser_helper
@@ -7,6 +7,7 @@ import parser_helper
 class GpuProfilingParser:
     def __init__(self, args):
         self.trace_events = self.read_profiling_json_file(args.gpu)
+        self.compute_stream_id = self.infer_compute_stream_id()
         self.one_step_time = args.gpu_log_time
         self.profiling_info = parser_helper.ProfilingInfo()
 
@@ -22,12 +23,12 @@ class GpuProfilingParser:
         all_op_time = 0.0
         op_list = []
         compute_stream_dur = 0.0  # compute stream duration
-        marks = defaultdict(int)  # mark for compute communication_not_overlapped time 
+        marks = defaultdict(int)  # mark for compute communication_not_overlapped time
 
         for event in self.trace_events:
             if not isinstance(event, dict):
                 continue
-            if event.get('args') and event.get('args').get('stream') == 7:
+            if event.get('args') and event.get('args').get('stream') == self.compute_stream_id:
                 compute_stream_dur += float(event.get('dur'))
             if not {'name', 'cat', 'dur', 'ts'} < event.keys():
                 continue
@@ -39,18 +40,19 @@ class GpuProfilingParser:
                 continue
             if 'nccl' in name:
                 for timestep in range(ts + 1, ts + dur + 1):
-                    marks[str(timestep)] += 1  # mark this timestep in communication stream 
+                    marks[str(timestep)] += 1  # mark this timestep in communication stream
                 continue
             else:
                 for timestep in range(ts + 1, ts + dur + 1):
-                    marks[str(timestep)] += -100  # mark this timestep in compute stream 
+                    marks[str(timestep)] += -100  # mark this timestep in compute stream
                 if 'gemm' in name:
                     cube_time += float(dur)
                 all_op_time += float(dur)
                 op_list.append([ts, name, cat, dur])
         op_dataframe = pd.DataFrame(op_list, columns=['time start', 'name', 'cat', 'dur'])
         op_dataframe.to_csv('gpu_perf.csv', index=False)
-        self.profiling_info.communication_not_overlapped = len([_ for _,value in marks.items() if value > 0]) / 10 ** 6
+        self.profiling_info.compute_time = compute_stream_dur / 10 ** 6
+        self.profiling_info.communication_not_overlapped = len([_ for _, value in marks.items() if value > 0]) / 10 ** 6
         self.profiling_info.cube_time = cube_time / 10 ** 6
         self.profiling_info.vector_time = (all_op_time - cube_time) / 10 ** 6
         self.parse_e2e_time()
@@ -64,13 +66,30 @@ class GpuProfilingParser:
         self.parse_memory_reserved()
 
     def parse_e2e_time(self):
-        timeline = sorted(self.trace_events, key=lambda event: event.get('ts'))
-        self.profiling_info.e2e_time = (timeline[-1].get('ts') - timeline[0].get('ts')) / 10 ** 6
+        compute_events_timeline = [event for event in self.trace_events if
+                                   event.get('args') and event.get('args').get('stream') == self.compute_stream_id]
+        compute_events_timeline = sorted(compute_events_timeline, key=lambda event: event.get('ts'))
+        self.profiling_info.e2e_time = (compute_events_timeline[-1].get('ts') + compute_events_timeline[-1].get('dur') -
+                                        compute_events_timeline[0].get('ts')) / 10 ** 6
 
     def parse_memory_reserved(self):
-        memories = [event.get('args').get('Total Reserved') for event in self.trace_events if
-                    event.get('name') == '[memory]' and event.get('args').get('Device Id') >= 0]
+        memories = [
+            event.get('args').get('Total Reserved') for event in self.trace_events
+            if event.get('name') == '[memory]' and event.get('args').get('Device Id') >= 0
+        ]
         if not memories:
             print("Gpu profiling data doesn't contain memory info")
             return
         self.profiling_info.memory_used = max(memories) / 1024 ** 3
+
+    def infer_compute_stream_id(self):
+        kernel_stream_ids = []
+        for event in self.trace_events:
+            is_kernel_exec_event = event.get('cat') == 'Kernel' and 'nccl' not in event.get('name')
+            has_stream_id_event = event.get('args') and event.get('args').get('stream')
+            if is_kernel_exec_event and has_stream_id_event:
+                kernel_stream_ids.append(event.get('args').get('stream'))
+        if not kernel_stream_ids:
+            raise RuntimeError('The profiling data does not contain kernel running data.')
+        counter = Counter(kernel_stream_ids)
+        return counter.most_common(1)[0][0]
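
As a sanity check on the logic above, the following minimal standalone sketch (not part of the patch) replays the same ideas on a few hand-made trace events: compute-stream inference by majority vote over non-nccl kernel events, microsecond marking for the non-overlapped communication time, and the ts + dur based compute-stream e2e time. The event names, timestamps, and stream ids below are invented for illustration.

```python
from collections import Counter, defaultdict

# Synthetic traceEvents-style records (ts/dur in microseconds); names and stream ids are made up.
events = [
    {'name': 'gemm_kernel', 'cat': 'Kernel', 'ts': 0, 'dur': 4, 'args': {'stream': 7}},
    {'name': 'nccl_all_reduce', 'cat': 'Kernel', 'ts': 2, 'dur': 6, 'args': {'stream': 20}},
    {'name': 'vector_add', 'cat': 'Kernel', 'ts': 10, 'dur': 3, 'args': {'stream': 7}},
]

# Compute stream id = the stream carrying the most non-nccl kernel events (majority vote).
kernel_streams = [e['args']['stream'] for e in events
                  if e.get('cat') == 'Kernel' and 'nccl' not in e.get('name', '')]
compute_stream_id = Counter(kernel_streams).most_common(1)[0][0]

# Mark each microsecond: +1 if a communication (nccl) event covers it, -100 if a compute event does.
# Microseconds left with a positive mark have communication but no compute, i.e. non-overlapped time.
marks = defaultdict(int)
for e in events:
    step = 1 if 'nccl' in e['name'] else -100
    for t in range(e['ts'] + 1, e['ts'] + e['dur'] + 1):
        marks[t] += step
not_overlapped_us = sum(1 for v in marks.values() if v > 0)

# Compute-stream e2e time: ts + dur of the last compute event minus ts of the first one.
compute = sorted((e for e in events if e['args']['stream'] == compute_stream_id), key=lambda e: e['ts'])
e2e_us = compute[-1]['ts'] + compute[-1]['dur'] - compute[0]['ts']

print(compute_stream_id, not_overlapped_us, e2e_us)  # 7 4 13
```

In this toy trace the nccl event spans microseconds 3-8 while the compute stream only covers 1-4 and 11-13, so 4 µs of communication are not overlapped; the parser divides such counts by 10 ** 6 to report seconds.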