diff --git a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py
index 7b1ae1a5a12ac1547123f5822e63069d719a18a6..04bbc8afa6f39aa3307b4827a2f1fbf273f30cc8 100644
--- a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py
+++ b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py
@@ -9,7 +9,8 @@ from compare_backend.utils.constant import Constant
 class GPUProfilingParser(BaseProfilingParser):
     CUBE_MARK = ['gemm', 'conv', 'cutlass', 'wgrad']
     FA_MARK_LIST = [['fmha', 'kernel'], ['flash', 'kernel'], ['attention', 'kernel']]
-    SDMA_MARK_LIST = ['htod', 'dtod', 'dtoh', 'memset (device)']
+    TENSOR_MOVE_MARK_LIST = ['htod', 'dtod', 'dtoh']
+    SDMA_MARK = "memset (device)"
     FLOW_CAT = ("async_gpu", "async_cpu_to_gpu", "ac2g", "async")
     TORCH_OP_CAT = ("cpu_op", "user_annotation", "cuda_runtime", "operator", "runtime")
 
@@ -31,7 +32,11 @@ class GPUProfilingParser(BaseProfilingParser):
 
     @classmethod
     def __is_sdma_time(cls, name: str):
-        return any(mask in name.lower() for mask in cls.SDMA_MARK_LIST)
+        return cls.SDMA_MARK in name.lower()
+
+    @classmethod
+    def __is_tensor_move_time(cls, name: str):
+        return any(mark in name.lower() for mark in cls.TENSOR_MOVE_MARK_LIST)
 
     def _update_memory_list(self):
         if not self._enable_memory_compare:
@@ -76,9 +81,12 @@ class GPUProfilingParser(BaseProfilingParser):
                 self._result_data.overall_metrics.update_sdma_info(event.dur)
                 self._result_data.overall_metrics.update_sdma_stream_info(event.dur)
                 continue
+            if self.__is_tensor_move_time(event.name):
+                self._result_data.overall_metrics.update_sdma_tensor_move_info(event.dur)
+                continue
             if not event.is_kernel_cat():
                 continue
             self.__add_marks(event)
             if event.is_nccl_name():
                 continue
             self.__add_compute_time(event, aten_events, flow_dict_new)
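
Note (reviewer sketch, not part of the patch): the patch splits the old SDMA_MARK_LIST into two classifiers, so that device-side memsets stay in the SDMA bucket while host/device copies are reported separately as tensor-move time. Below is a minimal, runnable Python illustration of that split; the standalone function names and the sample event names are assumptions for demonstration only, standing in for the private methods __is_sdma_time / __is_tensor_move_time on GPUProfilingParser.

    TENSOR_MOVE_MARK_LIST = ['htod', 'dtod', 'dtoh']
    SDMA_MARK = "memset (device)"

    def is_sdma_time(name: str) -> bool:
        # After the patch, only device-side memsets count as SDMA time.
        return SDMA_MARK in name.lower()

    def is_tensor_move_time(name: str) -> bool:
        # Host<->device and device<->device copies are accounted
        # separately via update_sdma_tensor_move_info in the patch.
        return any(mark in name.lower() for mark in TENSOR_MOVE_MARK_LIST)

    # Illustrative event names (made up for this sketch):
    for name in ("Memcpy HtoD (Pageable -> Device)",
                 "Memcpy DtoD (Device -> Device)",
                 "Memset (Device)",
                 "ampere_sgemm_128x64_nn"):
        kind = ("sdma" if is_sdma_time(name)
                else "tensor_move" if is_tensor_move_time(name)
                else "other")
        print(f"{name!r} -> {kind}")

With these markers, the two memcpy names classify as tensor_move, the memset as sdma, and the GEMM kernel falls through to the normal compute-time path, matching the control flow in the third hunk.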