diff --git a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py index bfc631c66c86f061b10445e117e9f947d7ebdbc5..7ad66c0fa7d8f42c6078bd1d04f2164ebda0e9d8 100644 --- a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py +++ b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py @@ -37,9 +37,12 @@ class OverallPerformanceComparator(BaseComparator): self._headers.append('Mem Usage') base_col.append(f'{base_profiling_info.memory_used:.2f}G') comp_col.append(f'{comp_profiling_info.memory_used:.2f}G') - self._headers.extend(['Uncovered Communication Time']) - base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s']) - comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s']) + self._headers.extend(['Uncovered Communication Time(Wait Time)']) + if base_profiling_info.wait_time: + base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s({base_profiling_info.wait_time:.3f}s']) + else: + base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s( / )']) + comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s({comp_profiling_info.wait_time:.3f}s)']) if base_profiling_info.sdma_time or comp_profiling_info.sdma_time: self._headers.append('SDMA Time(Num)') base_col.append(f'{base_profiling_info.sdma_time:.3f}s({base_profiling_info.sdma_num})') diff --git a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py index 9184c790b7ea59246b602442a13e7e533d921bc8..b100e7ba9877d6bf4c316b590afc3a37a9346070 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py +++ b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py @@ -18,6 +18,7 @@ class ProfilingInfo: self.fa_num_bwd = 0 self.compute_time = 0.0 self.communication_not_overlapped = 0.0 + self.wait_time = 0.0 self.memory_used = 0.0 self.e2e_time = 0.0 self.sdma_time = 0.0 @@ -33,6 +34,7 @@ class ProfilingInfo: self.vec_time = self.vec_time / 10 ** 6 self.compute_time = self.compute_time / 10 ** 6 self.communication_not_overlapped = self.communication_not_overlapped / 10 ** 6 + self.wait_time = self.wait_time / 10 ** 6 self.e2e_time = self.e2e_time / 10 ** 6 self.sdma_time = self.sdma_time / 10 ** 6 self.scheduling_time = self.scheduling_time / 10 ** 6 @@ -84,6 +86,9 @@ class ProfilingInfo: def update_comm_not_overlap(self, time: float): self.communication_not_overlapped += time + def update_comm_not_overlap_wait_time(self, time: float): + self.wait_time = time + def set_memory_used(self, memory: float): self.memory_used = memory diff --git a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py index f872e52a5314a40dbc2e0d4ff7868e875986b809..dfc7d7d4363d1feee7221a930ce6e08350ddb47b 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py @@ -48,6 +48,7 @@ class NPUProfilingParser(BaseProfilingParser): if self._enable_profiling_compare: func_list.add(self._picking_overlap_analysis_data) func_list.add(self._picking_kernel_event) + func_list.add(self._picking_hccl_event) return list(func_list) def _update_memory_list(self): @@ -98,10 +99,65 @@ class NPUProfilingParser(BaseProfilingParser): self.__parse_kernel_csv() self.__add_sdma_time() self.__add_overlap_analysis_time() + self._picking_notify_wait_event_and_not_overlap_event() + self.__add_overlap_wait_time() self._result_data.overall_metrics.calculate_other_time() self._result_data.overall_metrics.calculate_schedule_time() self._result_data.overall_metrics.trans_time_to_s() + def _picking_notify_wait_event_and_not_overlap_event(self): + self.notify_event_cache = [] + self._not_overlaped_commu_event = [] + for event in self._commu_task_list: + if event.name == 'Notify_Wait' and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_CHECK': + self.notify_event_cache.append(event) + for event in self._overlap_analysis: + if event.is_comm_not_overlap(): + self._not_overlaped_commu_event.append(event) + + def __add_overlap_wait_time(self): + notify_wait_event_dict = dict() + for notify_event in self.notify_event_cache: + if notify_event.tid in notify_wait_event_dict: + notify_wait_event_dict[notify_event.tid].append(notify_event) + else: + notify_wait_event_dict[notify_event.tid] = [notify_event] + total_time = 0 + for commu_event in self._not_overlaped_commu_event: + commu_event_start_time = float(commu_event.start_time) + commu_event_end_time = float(commu_event.end_time) + wait_time_list = [] + + for plane_id, events in notify_wait_event_dict.items(): + wait_time = 0 + idx = 0 + for notify_event in events: + notify_event_start_time = float(notify_event.start_time) + notify_event_end_time = float(notify_event.end_time) + if notify_event_start_time < commu_event_start_time and notify_event_end_time > commu_event_end_time: + wait_time = commu_event_end_time - commu_event_start_time + break + elif notify_event_start_time < commu_event_start_time and commu_event_start_time <= notify_event_end_time <= commu_event_end_time: + wait_time += notify_event_end_time - commu_event_start_time + idx += 1 + elif commu_event_start_time <= notify_event_start_time <= commu_event_end_time and notify_event_end_time > commu_event_end_time: + wait_time += commu_event_end_time - commu_event_start_time + break + elif notify_event_start_time >= commu_event_start_time and notify_event_end_time <= commu_event_end_time: + wait_time += notify_event_end_time - notify_event_start_time + elif notify_event_start_time >= commu_event_start_time and notify_event_end_time <= commu_event_end_time: + wait_time += notify_event_end_time - notify_event_start_time + idx += 1 + elif notify_event_end_time < commu_event_start_time: + idx += 1 + else: + break + + wait_time_list.append(wait_time) + notify_wait_event_dict[plane_id] = notify_wait_event_dict[plane_id][idx:] + total_time += max(wait_time_list) + self._result_data.overall_metrics.update_comm_not_overlap(total_time) + def _picking_hccl_event(self, event: TraceEventBean): if event.pid != self._hccl_pid or not event.is_x_mode(): return False