diff --git a/profiler/compare_tools/profiling_analysis/gpu_parser.py b/profiler/compare_tools/profiling_analysis/gpu_parser.py index 4443562bd4edae71d30c0314ea22756a8d20b534..a9c8466725ec66a16da836acb9d29029713f9492 100644 --- a/profiler/compare_tools/profiling_analysis/gpu_parser.py +++ b/profiler/compare_tools/profiling_analysis/gpu_parser.py @@ -26,26 +26,33 @@ class OpTimeWarper: def __init__( self, cube_time: float = 0.0, + sdma_time: float = 0.0, + vec_time: float = 0.0, fa_time_fwd: float = 0.0, fa_time_bwd: float = 0.0, all_op_time: float = 0.0, compute_stream_dur: float = 0.0, cube_num: int = 0, - vec_num: int = 0 + vec_num: int = 0, + sdma_num: int = 0 ): self.cube_time = cube_time + self.sdma_time = sdma_time + self.vec_time = vec_time self.fa_time_fwd = fa_time_fwd self.fa_time_bwd = fa_time_bwd self.all_op_time = all_op_time self.compute_stream_dur = compute_stream_dur self.cube_num = cube_num self.vec_num = vec_num + self.sdma_num = sdma_num class GpuProfilingParser: NCCL_MARK = 'nccl' CUBE_MARK = 'gemm' FA_MARK_LIST = [['fmha', 'kernel'], ['flash', 'kernel']] + SDMA_MARK_LIST = ['htod', 'dtod', 'dtoh', 'memset (device)'] def __init__(self, gpu_path): self.trace_events = FileReader.read_trace_file(gpu_path).get('traceEvents') @@ -59,13 +66,22 @@ class GpuProfilingParser: return True return False + def is_sdma_time(self, name: str): + for mark in self.SDMA_MARK_LIST: + if mark in name.lower(): + return True + return False + def update_op_list(self, op_list, marks): cube_time = 0.0 all_op_time = 0.0 fa_time_bwd = 0.0 fa_time_fwd = 0.0 + sdma_time = 0.0 + vec_time = 0.0 cube_num = 0 vec_num = 0 + sdma_num = 0 compute_stream_dur = 0.0 for event in self.trace_events: if not isinstance(event, dict): @@ -78,6 +94,9 @@ class GpuProfilingParser: dur = event.get('dur') ts = event.get('ts') cat = event.get('cat', '') + if self.is_sdma_time(name): + sdma_time += float(dur) + sdma_num += 1 if cat.lower() != 'kernel': continue if self.NCCL_MARK in name.lower(): @@ -97,16 +116,20 @@ class GpuProfilingParser: cube_time += float(dur) else: vec_num += 1 + vec_time += float(dur) all_op_time += float(dur) op_list.append([ts, name, cat, dur]) time_wrapper = OpTimeWarper( cube_time=cube_time, + sdma_time=sdma_time, + vec_time=vec_time, fa_time_fwd=fa_time_fwd, fa_time_bwd=fa_time_bwd, all_op_time=all_op_time, compute_stream_dur=compute_stream_dur, cube_num=cube_num, - vec_num=vec_num + vec_num=vec_num, + sdma_num=sdma_num ) return time_wrapper @@ -122,15 +145,20 @@ class GpuProfilingParser: compute_stream_dur = time_wrapper.compute_stream_dur cube_num = time_wrapper.cube_num vec_num = time_wrapper.vec_num + sdma_num = time_wrapper.sdma_num + sdma_time = time_wrapper.sdma_time + vec_time = time_wrapper.vec_time self.profiling_info.compute_time = len([_ for _, value in marks.items() if value < 0]) / 10 ** 6 self.profiling_info.communication_not_overlapped = len([_ for _, value in marks.items() if value > 0]) / 10 ** 6 self.profiling_info.flash_attention_time_bwd = fa_time_bwd / 10 ** 6 self.profiling_info.flash_attention_time_fwd = fa_time_fwd / 10 ** 6 self.profiling_info.cube_time = cube_time / 10 ** 6 - self.profiling_info.vec_time = (all_op_time - cube_time - fa_time_fwd - fa_time_bwd) / 10 ** 6 + self.profiling_info.vec_time = self.profiling_info.compute_time - (cube_time + fa_time_fwd + fa_time_bwd) / 10 ** 6 self.profiling_info.cube_num = cube_num self.profiling_info.vec_num = vec_num + self.profiling_info.sdma_num = sdma_num + self.profiling_info.sdma_time = sdma_time / 10 ** 6 self.parse_e2e_time() self.profiling_info.scheduling_time = self.profiling_info.e2e_time - all_op_time / 10 ** 6 - \ diff --git a/profiler/compare_tools/profiling_analysis/npu_parser.py b/profiler/compare_tools/profiling_analysis/npu_parser.py index 2c71b0dc4a5e10f5f40f70da618dd017cfc461f7..8c9b3e75c0546d387fc862731a2b47163df58ce8 100644 --- a/profiler/compare_tools/profiling_analysis/npu_parser.py +++ b/profiler/compare_tools/profiling_analysis/npu_parser.py @@ -27,6 +27,8 @@ class NpuInfoWrapper: self, compute_time: int, communication_time: int, + sdma_time: int, + sdma_num: int, is_cluster: bool, event_wait_sqe: dict, ai_core_dict: dict, @@ -35,6 +37,8 @@ class NpuInfoWrapper: ): self.compute_time = compute_time self.communication_time = communication_time + self.sdma_time = sdma_time + self.sdma_num = sdma_num self.is_cluster = is_cluster self.event_wait_sqe = event_wait_sqe self.ai_core_dict = ai_core_dict @@ -56,6 +60,30 @@ class NpuProfilingParser: self.aicore_time = 0 self.min_stream_ts = sys.float_info.max self.max_stream_ts = sys.float_info.min + self.sdma_sqe = defaultdict(float) + self.sdma_num_cnt = defaultdict(int) + + def get_sdma_para(self, sdma_sqe, sdma_num_cnt, ai_core_dict, event_wait_sqe) -> (float, int): + compute_stream = [] + parallel_stream = [] + sdma_time = 0.0 + sdma_parallel_time = 0.0 + sdma_num = 0 + sdma_parallel_num = 0 + if len(ai_core_dict) == 1: + compute_stream.append(min(ai_core_dict.keys())) + elif len(ai_core_dict) == 2: # 2个ai_core,存在并行流(当前最多2条算子计算流) + compute_stream = list(event_wait_sqe.keys() & ai_core_dict.keys()) + parallel_stream = list(ai_core_dict.keys() - set(compute_stream)) + else: + print('[WARNING] Npu Compute Stream Num Error.') + if parallel_stream: + sdma_parallel_time = sdma_sqe[parallel_stream[0]] + sdma_parallel_num = sdma_num_cnt[parallel_stream[0]] + if compute_stream: + sdma_time = sdma_sqe[compute_stream[0]] + sdma_parallel_time + sdma_num = sdma_num_cnt[compute_stream[0]] + sdma_parallel_num + return sdma_time, sdma_num def parse_npu_json_events(self): if not self.npu_json_file: @@ -87,8 +115,9 @@ class NpuProfilingParser: communication_time += dur min_ts = ts if ts < min_ts else min_ts max_ts = (ts + dur) if (ts + dur) > max_ts else max_ts + sdma_time, sdma_num = self.get_sdma_para(self.sdma_sqe, self.sdma_num_cnt, ai_core_dict, event_wait_sqe) npu_info_wrapper = NpuInfoWrapper( - compute_time, communication_time, is_cluster, + compute_time, communication_time, sdma_time, sdma_num, is_cluster, event_wait_sqe, ai_core_dict, event_wait_sqe_res, ai_core_res) self.update_npu_info(max_ts - min_ts, npu_info_wrapper) @@ -100,6 +129,8 @@ class NpuProfilingParser: ai_core_dict = npu_info_wrapper.ai_core_dict event_wait_sqe_res = npu_info_wrapper.event_wait_sqe_res ai_core_res = npu_info_wrapper.ai_core_res + sdma_time = npu_info_wrapper.sdma_time + sdma_num = npu_info_wrapper.sdma_num # AI_CORE和EVENT_WAIT_SQE共存为计算流 compute_stream = [] parallel_stream = [] @@ -121,11 +152,16 @@ class NpuProfilingParser: self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list) self.profiling_info.compute_time = compute_time / 10 ** 6 if is_cluster else \ ai_core_res[compute_stream[0]] / 10 ** 6 + self.profiling_info.other_time = self.profiling_info.compute_time - self.profiling_info.cube_time - \ + self.profiling_info.flash_attention_time_fwd - self.profiling_info.flash_attention_time_bwd - \ + self.profiling_info.vec_time self.profiling_info.e2e_time = ts_dur / 10 ** 6 if is_cluster else \ (self.max_stream_ts - self.min_stream_ts) / 10 ** 6 self.profiling_info.communication_not_overlapped = communication_time / 10 ** 6 \ if is_cluster else (event_wait_sqe_res[compute_stream[0]] - self.parallel_time) / 10 ** 6 time_required = self.profiling_info.compute_time + self.profiling_info.communication_not_overlapped + self.profiling_info.sdma_time = sdma_time / 10 ** 6 + self.profiling_info.sdma_num = sdma_num if self.npu_step_time: self.profiling_info.scheduling_time = self.npu_step_time - time_required else: @@ -226,6 +262,9 @@ class NpuProfilingParser: if args.get('Task Type') == 'EVENT_WAIT_SQE': enent_wait_res[stream_id] += dur event_wait_sqe[stream_id].append([ts, ts + dur]) + elif args.get('Task Type') in ('SDMA_SQE', 'PCIE_DMA_SQE'): + self.sdma_sqe[stream_id] += dur + self.sdma_num_cnt[stream_id] += 1 elif args.get('Task Type') in ('AI_CORE', 'MIX_AIC', 'MIX_AIV', 'AI_CPU', 'AI_VECTOR_CORE', 'FFTS_PLUS'): ai_core_res[stream_id] += dur ai_core_dict[stream_id].append([ts, ts + dur]) diff --git a/profiler/compare_tools/profiling_analysis/parser_helper.py b/profiler/compare_tools/profiling_analysis/parser_helper.py index f46095ed0cada49034cea0469aa837a5d3e1c049..3b42d9308ce823a527168bd831946557d9f310f9 100644 --- a/profiler/compare_tools/profiling_analysis/parser_helper.py +++ b/profiler/compare_tools/profiling_analysis/parser_helper.py @@ -21,6 +21,7 @@ class ProfilingInfo: def __init__(self, profiling_type: str): self.profiling_type = profiling_type self.cube_time = 0.0 + self.other_time = 0.0 self.vec_time = 0.0 self.cube_num = 0 self.vec_num = 0 @@ -29,6 +30,7 @@ class ProfilingInfo: self.scheduling_ratio = 0.0 self.memory_used = 0.0 self.e2e_time = 0.0 + self.sdma_time = 0.0 self.scheduling_time = 0.0 self.flash_attention_time_bwd = 0.0 self.flash_attention_time_fwd = 0.0 diff --git a/profiler/compare_tools/profiling_analysis/profiling_parse.py b/profiler/compare_tools/profiling_analysis/profiling_parse.py index 70b435f7a6d79e6845a3cc4c660d881364979b40..434fba8309e95112e17ebdb9c92714a3ba545053 100644 --- a/profiler/compare_tools/profiling_analysis/profiling_parse.py +++ b/profiler/compare_tools/profiling_analysis/profiling_parse.py @@ -35,6 +35,10 @@ def generate_table_info(base_profiling_info, comp_profiling_info, table): f'{base_profiling_info.vec_time:.3f}s({base_profiling_info.vec_num})']) comp_col.extend([f'{comp_profiling_info.cube_time:.3f}s({comp_profiling_info.cube_num})', f'{comp_profiling_info.vec_time:.3f}s({comp_profiling_info.vec_num})']) + if base_profiling_info.other_time or comp_profiling_info.other_time: + headers.append('Other Time') + base_col.append(f'{base_profiling_info.other_time:.3f}s') + comp_col.append(f'{comp_profiling_info.other_time:.3f}s') if base_profiling_info.flash_attention_time_fwd or comp_profiling_info.flash_attention_time_fwd: headers.append('Flash Attention Time(Forward)') base_col.append(f'{base_profiling_info.flash_attention_time_fwd:.3f}s') @@ -50,18 +54,26 @@ def generate_table_info(base_profiling_info, comp_profiling_info, table): headers.append('Mem Usage') base_col.append(f'{base_profiling_info.memory_used:.2f}G') comp_col.append(f'{comp_profiling_info.memory_used:.2f}G') + headers.extend(['Uncovered Communication Time']) + base_col.extend( + [f'{base_profiling_info.communication_not_overlapped: .3f}s']) + comp_col.extend( + [f'{comp_profiling_info.communication_not_overlapped: .3f}s']) + if base_profiling_info.sdma_time or comp_profiling_info.sdma_time: + headers.append('SDMA Time(Num)') + base_col.append(f'{base_profiling_info.sdma_time:.3f}s({base_profiling_info.sdma_num})') + comp_col.append(f'{comp_profiling_info.sdma_time:.3f}s({comp_profiling_info.sdma_num})') cue = '' if ((base_profiling_info.profiling_type == "NPU" and not base_profiling_info.minimal_profiling) or (comp_profiling_info.profiling_type == "NPU" and not comp_profiling_info.minimal_profiling)): + cue = '(Not minimal profiling)' - headers.extend(['Uncovered Communication Time', 'Free Time', 'E2E Time' + cue]) + headers.extend(['Free Time', 'E2E Time' + cue]) base_col.extend( - [f'{base_profiling_info.communication_not_overlapped: .3f}s', f'{base_profiling_info.scheduling_time:.3f}s', - f'{base_profiling_info.e2e_time:.3f}s']) + [f'{base_profiling_info.scheduling_time:.3f}s', f'{base_profiling_info.e2e_time:.3f}s']) comp_col.extend( - [f'{comp_profiling_info.communication_not_overlapped: .3f}s', f'{comp_profiling_info.scheduling_time:.3f}s', - f'{comp_profiling_info.e2e_time:.3f}s']) + [f'{comp_profiling_info.scheduling_time:.3f}s', f'{comp_profiling_info.e2e_time:.3f}s']) table.field_names = headers table.add_row(base_col) table.add_row(comp_col)