diff --git a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py index 7283c17b47dea78058d0541c1332df0fa45e90d9..1c5cee43e6bb01c2142112dbd9fcf1000ceecfd1 100644 --- a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py +++ b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py @@ -12,6 +12,14 @@ class OverallPerformanceComparator(BaseComparator): self._headers = [''] base_col = [f'{base_profiling_info.profiling_type}'] comp_col = [f'{comp_profiling_info.profiling_type}'] + if base_profiling_info.RDMA_bandwidth or comp_profiling_info.RDMA_bandwidth: + self._headers.extend(['RDMA Bandwidth(GB/s)']) + base_col.append(f'{base_profiling_info.RDMA_bandwidth:.3f}GB/s') + comp_col.append(f'{comp_profiling_info.RDMA_bandwidth:.3f}GB/s') + if base_profiling_info.SDMA_bandwidth or comp_profiling_info.SDMA_bandwidth: + self._headers.extend(['SDMA Bandwidth(GB/s)']) + base_col.append(f'{base_profiling_info.SDMA_bandwidth:.3f}GB/s') + comp_col.append(f'{comp_profiling_info.SDMA_bandwidth:.3f}GB/s') if not base_profiling_info.hide_op_details and not comp_profiling_info.hide_op_details: self._headers.extend(['Cube Time(Num)', 'Vector Time(Num)']) base_col.extend([f'{base_profiling_info.cube_time:.3f}s({base_profiling_info.cube_num})', diff --git a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py index e0a80a4d30d0feda38d4290667df6620855d8562..985374bd9958ec5fa6881f49ebadbc6104ed6d50 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py +++ b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py @@ -8,8 +8,20 @@ class ProfilingInfo: def __init__(self, profiling_type: str): self.profiling_type = profiling_type - self.cube_time = 0.0 self.other_time = 0.0 + self.lccl_num = 0 + self.compute_time = 0.0 + self.communication_not_overlapped = 0.0 + self.wait_time = 0.0 + self.memory_used = 0.0 + self.e2e_time = 0.0 + self.scheduling_time = 0.0 + self.lccl_time = 0.0 + self.minimal_profiling = False + self.hide_op_details = False + self.is_level0 = False + + self.cube_time = 0.0 self.vec_time = 0.0 self.cube_num = 0 self.vec_num = 0 @@ -17,26 +29,14 @@ class ProfilingInfo: self.fa_num_fwd = 0 self.fa_num_bwd = 0 self.pa_num = 0 - self.lccl_num = 0 self.conv_time_fwd = 0.0 self.conv_time_bwd = 0.0 self.conv_num_fwd = 0 self.conv_num_bwd = 0 - self.compute_time = 0.0 - self.communication_not_overlapped = 0.0 - self.wait_time = 0.0 - self.memory_used = 0.0 - self.e2e_time = 0.0 self.sdma_time = 0.0 - self.scheduling_time = 0.0 self.fa_time_bwd = 0.0 self.pa_time = 0.0 - self.lccl_time = 0.0 self.fa_time_fwd = 0.0 - self.minimal_profiling = False - self.hide_op_details = False - self.is_level0 = False - # 性能拆解新指标 self.fa_time_fwd_cube = 0.0 self.fa_num_fwd_cube = 0 @@ -76,7 +76,8 @@ class ProfilingInfo: self.other_cube_time = 0.0 self.other_cube_num = 0 - + self.RDMA_bandwidth = 0.0 + self.SDMA_bandwidth = 0.0 @property def e2e_time_ms(self): return self.e2e_time * 10 ** 3 @@ -137,22 +138,6 @@ class ProfilingInfo: return sum((self.vector_num_trans, self.vector_num_notrans)) def trans_time_to_s(self): - self.cube_time = self.cube_time / 10 ** 6 - self.other_time = self.other_time / 10 ** 6 - self.vec_time = self.vec_time / 10 ** 6 - self.compute_time = self.compute_time / 10 ** 6 - self.communication_not_overlapped = self.communication_not_overlapped / 10 ** 6 - self.wait_time = self.wait_time / 10 ** 6 - self.e2e_time = self.e2e_time / 10 ** 6 - self.sdma_time = self.sdma_time / 10 ** 6 - self.scheduling_time = self.scheduling_time / 10 ** 6 - self.fa_time_bwd = self.fa_time_bwd / 10 ** 6 - self.fa_time_fwd = self.fa_time_fwd / 10 ** 6 - self.pa_time = self.pa_time / 10 ** 6 - self.lccl_time = self.lccl_time / 10 ** 6 - self.conv_time_fwd = self.conv_time_fwd / 10 ** 6 - self.conv_time_bwd = self.conv_time_bwd / 10 ** 6 - # 新指标单位为ms self.fa_time_fwd_cube /= 10 ** 3 self.fa_time_bwd_cube /= 10 ** 3 @@ -171,6 +156,30 @@ class ProfilingInfo: self.page_attention_time /= 10 ** 3 self.other_cube_time /= 10 ** 3 + self.cube_time = (self.matmul_time_cube + self.matmul_time_vector + self.other_cube_time) / 1000 + self.vec_time = (self.vector_time_trans + self.vector_time_notrans) / 1000 + self.cube_num = (self.matmul_num_cube + self.matmul_num_vector + self.other_cube_num) / 1000 + self.vec_num = (self.vector_num_trans + self.vector_num_notrans) / 1000 + self.sdma_num = (self.sdma_num_tensor_move + self.sdma_num_stream) / 1000 + self.fa_num_fwd = (self.fa_num_fwd_cube + self.fa_num_fwd_vector) / 1000 + self.fa_num_bwd = (self.fa_num_bwd_cube + self.fa_num_bwd_vector) / 1000 + self.pa_num = self.page_attention_num / 1000 + self.conv_time_fwd = (self.conv_time_fwd_cube + self.conv_time_fwd_vector) / 1000 + self.conv_time_bwd = (self.conv_time_bwd_cube + self.conv_time_bwd_vector) / 1000 + self.conv_num_fwd = (self.conv_num_fwd_cube + self.conv_num_fwd_vector) / 1000 + self.conv_num_bwd = (self.conv_num_bwd_cube + self.conv_num_bwd_vector) / 1000 + self.sdma_time = (self.sdma_time_tensor_move + self.sdma_time_stream) / 1000 + self.fa_time_bwd = (self.fa_time_bwd_cube + self.fa_time_bwd_vector) / 1000 + self.pa_time = self.page_attention_time / 1000 + self.fa_time_fwd = (self.fa_time_fwd_cube + self.fa_time_fwd_vector) / 1000 + + self.other_time = self.other_time / 10 ** 6 + self.compute_time = self.compute_time / 10 ** 6 + self.communication_not_overlapped = self.communication_not_overlapped / 10 ** 6 + self.wait_time = self.wait_time / 10 ** 6 + self.e2e_time = self.e2e_time / 10 ** 6 + self.scheduling_time = self.scheduling_time / 10 ** 6 + self.lccl_time = self.lccl_time / 10 ** 6 def calculate_other_time(self): self.other_time = max( [0, self.compute_time - self.cube_time - self.fa_time_fwd - self.fa_time_bwd - @@ -183,14 +192,6 @@ class ProfilingInfo: def calculate_schedule_time(self): self.scheduling_time = (self.e2e_time - self.compute_time - self.lccl_time - self.communication_not_overlapped) - def update_fa_fwd_info(self, time: float): - self.fa_time_fwd += time - self.fa_num_fwd += 1 - - def update_fa_bwd_info(self, time: float): - self.fa_time_bwd += time - self.fa_num_bwd += 1 - def update_fa_fwd_cube_info(self, time: float): self.fa_time_fwd_cube += time self.fa_num_fwd_cube += 1 @@ -215,22 +216,10 @@ class ProfilingInfo: self.sdma_time_stream += time self.sdma_num_stream += num - def update_pa_info(self, time: float): - self.pa_time += time - self.pa_num += 1 - def update_lccl_info(self, time: float): self.lccl_time += time self.lccl_num += 1 - def update_conv_fwd_info(self, time: float): - self.conv_time_fwd += time - self.conv_num_fwd += 1 - - def update_conv_bwd_info(self, time: float): - self.conv_time_bwd += time - self.conv_num_bwd += 1 - def update_conv_bwd_cube_info(self, time: float): self.conv_time_bwd_cube += time self.conv_num_bwd_cube += 1 @@ -267,18 +256,6 @@ class ProfilingInfo: self.vector_time_notrans += time self.vector_num_notrans += 1 - def update_sdma_info(self, time: float, num: int = 1): - self.sdma_time += time - self.sdma_num += num - - def update_cube_info(self, time: float): - self.cube_time += time - self.cube_num += 1 - - def update_vec_info(self, time: float): - self.vec_time += time - self.vec_num += 1 - def update_other_cube_info(self, time: float): self.other_cube_time += time self.other_cube_num += 1 @@ -306,3 +283,9 @@ class ProfilingInfo: def is_not_minimal_profiling(self) -> bool: return self.profiling_type == Constant.NPU and not self.minimal_profiling + + def set_RDMA_bandwidth(self, bandwidth: float): + self.RDMA_bandwidth = bandwidth + + def set_SDMA_bandwidth(self, bandwidth: float): + self.SDMA_bandwidth = bandwidth \ No newline at end of file diff --git a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py index 0aeeba83efb1ec62b0cf53ced7084dcccb7aa6c8..175b77603c8b83388c34b49843e3ff1257a2414f 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py @@ -61,9 +61,9 @@ class GPUProfilingParser(BaseProfilingParser): def _update_overall_metrics(self): self._calculate_performance_time() self.__parse_memory_reserved() + self._result_data.overall_metrics.trans_time_to_s() self._result_data.overall_metrics.calculate_vec_time() self._result_data.overall_metrics.calculate_schedule_time() - self._result_data.overall_metrics.trans_time_to_s() def _calculate_performance_time(self): min_ts = sys.float_info.max @@ -76,7 +76,6 @@ class GPUProfilingParser(BaseProfilingParser): min_ts = min(event.start_time, min_ts) max_ts = max(event.end_time, max_ts) if event.stream == self._compute_stream_id and self.__is_sdma_time(event.name): - self._result_data.overall_metrics.update_sdma_info(event.dur) self._result_data.overall_metrics.update_sdma_stream_info(event.dur) continue if not event.is_kernel_cat(): @@ -84,7 +83,6 @@ class GPUProfilingParser(BaseProfilingParser): self.__add_marks(event) if event.is_nccl_name(): continue - self.__add_compute_time(event, aten_events, flow_dict_new) self.categorize_computing_performance_data(event, flow_dict_new) self._aten_events = None self._result_data.overall_metrics.set_e2e_time(float(max_ts - min_ts)) @@ -104,23 +102,6 @@ class GPUProfilingParser(BaseProfilingParser): for timestep in range(int(event.start_time + 1), int(event.end_time + 1)): self._marks[str(timestep)] += -100 # mark this timestep in compute stream - def __add_compute_time(self, event: TraceEventBean, aten_events: list, flow_dict_new: dict): - if self.__is_flash_attention(event.name): - if event.is_backward(): - self._result_data.overall_metrics.update_fa_bwd_info(event.dur) - else: - self._result_data.overall_metrics.update_fa_fwd_info(event.dur) - elif any(cube_mark in event.lower_name for cube_mark in self.CUBE_MARK): - is_conv = self.__check_is_conv(event, aten_events, flow_dict_new) - if is_conv == "conv_fwd": - self._result_data.overall_metrics.update_conv_fwd_info(event.dur) - elif is_conv == "conv_bwd": - self._result_data.overall_metrics.update_conv_bwd_info(event.dur) - else: - self._result_data.overall_metrics.update_cube_info(event.dur) - else: - self._result_data.overall_metrics.update_vec_info(event.dur) - def __check_is_conv(self, event: TraceEventBean, aten_events: list, flow_dict_new: dict) -> str: flow_start_time = flow_dict_new.get(event.start_time) if not flow_start_time: diff --git a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py index cb25c252c6c825cb22fea63a4c1ecc82f9c61e57..3c3f054273a6d3b919c8a570d4c8898c47010b8f 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py @@ -22,6 +22,7 @@ class NPUProfilingParser(BaseProfilingParser): self._operator_memory_path = os.path.join(path_dict.get(Constant.ASCEND_OUTPUT_PATH, ""), "operator_memory.csv") self._memory_record_path = os.path.join(path_dict.get(Constant.ASCEND_OUTPUT_PATH, ""), "memory_record.csv") self._kernel_detail_path = os.path.join(path_dict.get(Constant.ASCEND_OUTPUT_PATH, ""), "kernel_details.csv") + self._communication_path = os.path.join(path_dict.get(Constant.ASCEND_OUTPUT_PATH, ""), "communication.json") self._info_json_path = path_dict.get(Constant.INFO_JSON_PATH, "") self._trace_events = [TraceEventBean(event) for event in self._trace_events] self._hccl_pid = None @@ -78,7 +79,6 @@ class NPUProfilingParser(BaseProfilingParser): print("[ERROR] Failed to enable enable_kernel_compare, type of kernel_details.csv is null.") return self._result_data.update_kernel_details(kernels_dict) - def _update_memory_list(self): try: memory_data = FileReader.read_csv_file(self._operator_memory_path, OperatorMemoryBean) @@ -121,6 +121,35 @@ class NPUProfilingParser(BaseProfilingParser): return self._dequeue_data[left].corr_id if self._dequeue_data[left].start_time <= ts_time <= \ self._dequeue_data[left].end_time else Constant.INVALID_VALUE + def _update_bandwidth(self): + try: + communication_json = FileReader.read_json_file(self._communication_path) + except FileNotFoundError: + print("[WARNING] The file communication.json does not exist.") + except Exception: + print("[ERROR] Failed to read communication.json.") + return + if not communication_json: + print("[WARNING] The JSON file is empty.") + return + for _, group_dict in communication_json.items(): + step_dict = group_dict.get("collective") + total_op_info = step_dict.get("Total Op Info", {}) + rdma_size_mb = rdma_time_ms = sdma_size_mb = sdma_time_ms = 0 + if "Communication Bandwidth Info" in total_op_info: + bandwidth_info = total_op_info["Communication Bandwidth Info"] + if "RDMA" in bandwidth_info: + rdma_info = bandwidth_info["RDMA"] + rdma_size_mb += rdma_info.get("Transit Size(MB)", 0) # 单位为 MB + rdma_time_ms += rdma_info.get("Transit Time(ms)", 0) # 单位为 MS + if "SDMA" in bandwidth_info: + sdma_info = bandwidth_info["SDMA"] + sdma_size_mb += sdma_info.get("Transit Size(MB)", 0) # 单位为 MB + sdma_time_ms += sdma_info.get("Transit Time(ms)", 0) # 单位为 MS + rdma_bandwidth = (rdma_size_mb / 1024) / (rdma_time_ms / 1000) if rdma_time_ms > 0 else 0 + sdma_bandwidth = (sdma_size_mb / 1024) / (sdma_time_ms / 1000) if sdma_time_ms > 0 else 0 + self._result_data.overall_metrics.set_RDMA_bandwidth(rdma_bandwidth) + self._result_data.overall_metrics.set_SDMA_bandwidth(sdma_bandwidth) def _update_overall_metrics(self): self.__parse_info_json() self.__parse_mem_csv() @@ -130,10 +159,11 @@ class NPUProfilingParser(BaseProfilingParser): self.__add_overlap_analysis_time() self._picking_notify_wait_event_and_not_overlap_event() self.__add_overlap_wait_time() + self._result_data.overall_metrics.trans_time_to_s() self._result_data.overall_metrics.calculate_other_time() self._result_data.overall_metrics.calculate_schedule_time() - self._result_data.overall_metrics.trans_time_to_s() + self._update_bandwidth() def _picking_notify_wait_event_and_not_overlap_event(self): self.notify_event_cache = [] self._not_overlaped_commu_event = [] @@ -271,28 +301,6 @@ class NPUProfilingParser(BaseProfilingParser): self._result_data.overall_metrics.update_lccl_info(event.dur) def __parse_kernel_csv(self): - def __screen_data(kernel: KernelDetailsBean): - if kernel.is_flash_attention(): - if kernel.is_fa_bwd(): - self._result_data.overall_metrics.update_fa_bwd_info(kernel.duration) - else: - self._result_data.overall_metrics.update_fa_fwd_info(kernel.duration) - elif kernel.is_conv(): - if kernel.is_conv_bwd(): - self._result_data.overall_metrics.update_conv_bwd_info(kernel.duration) - else: - self._result_data.overall_metrics.update_conv_fwd_info(kernel.duration) - elif kernel.is_matmul(): - self._result_data.overall_metrics.update_cube_info(kernel.duration) - elif kernel.is_sdma(): - self._result_data.overall_metrics.update_sdma_info(kernel.duration) - elif kernel.is_page_attention(): - self._result_data.overall_metrics.update_pa_info(kernel.duration) - elif kernel.is_vector(): - self._result_data.overall_metrics.update_vec_info(kernel.duration) - else: - self._result_data.overall_metrics.update_cube_info(kernel.duration) - try: kernel_details = FileReader.read_csv_file(self._kernel_detail_path, KernelDetailsBean) except Exception: @@ -306,7 +314,6 @@ class NPUProfilingParser(BaseProfilingParser): for kernel in kernel_details: if kernel.is_invalid(): continue - __screen_data(kernel) self.categorize_computing_performance_data(kernel, flow_dict_new) def __parse_mem_csv(self): @@ -353,5 +360,4 @@ class NPUProfilingParser(BaseProfilingParser): compute_stream = event_wait_stream & ai_core_stream if event_wait_stream else ai_core_stream for stream in compute_stream: dur_list = sdma_dict.get(stream, []) - self._result_data.overall_metrics.update_sdma_info(sum(dur_list), len(dur_list)) self._result_data.overall_metrics.update_sdma_stream_info(sum(dur_list), len(dur_list)) diff --git a/profiler/compare_tools/compare_backend/utils/constant.py b/profiler/compare_tools/compare_backend/utils/constant.py index 252aa536e1c73d58f86071bbefab5286004bb6f9..80d7d5ee4f9a0834831e8c81bc61e299caf42bee 100644 --- a/profiler/compare_tools/compare_backend/utils/constant.py +++ b/profiler/compare_tools/compare_backend/utils/constant.py @@ -6,6 +6,7 @@ class Constant(object): MAX_PATH_LENGTH = 4096 MAX_FLOW_CAT_LEN = 20 MAX_FILE_SIZE = 1024 * 1024 * 1024 * 5 + MAX_JSON_SIZE = 1024 * 1024 * 1024 * 10 BYTE_TO_KB = 1024 YELLOW_COLOR = "FFFF00" GREEN_COLOR = "00FF00" diff --git a/profiler/compare_tools/compare_backend/utils/file_reader.py b/profiler/compare_tools/compare_backend/utils/file_reader.py index b4ae786388b2f1bed6ad50cfb39ac8621c1ea1f1..99358368cb1d45991da755d115d7542361c24a54 100644 --- a/profiler/compare_tools/compare_backend/utils/file_reader.py +++ b/profiler/compare_tools/compare_backend/utils/file_reader.py @@ -7,7 +7,28 @@ from compare_backend.utils.constant import Constant class FileReader: - + @classmethod + def read_json_file(cls, file_path: str, bean_class: any = None) -> any: + PathManager.check_path_readable(file_path) + if not os.path.isfile(file_path): + raise FileNotFoundError("File not exists.") + file_size = os.path.getsize(file_path) + if file_size <= 0: + return [] + if file_size > Constant.MAX_JSON_SIZE: + check_msg = input( + f"The file({file_path}) size exceeds the preset max value. Continue reading the file? [y/n]") + if check_msg.lower() != "y": + print(f"[WARNING] The user choose not to read the file: {file_path}") + return [] + result_data = [] + try: + with open(file_path, "r") as json_file: + result_data = json.loads(json_file.read()) + except Exception as e: + msg = f"Failed to read the file: {file_path}" + raise RuntimeError(msg) from e + return result_data @classmethod def read_trace_file(cls, file_path: str) -> any: PathManager.check_path_readable(file_path)