From 51bb59fe8b57621123a8c97c69f32ad9bf0cc79f Mon Sep 17 00:00:00 2001 From: zhouxianqi <13165993773@163.com> Date: Wed, 13 Dec 2023 20:10:06 +0800 Subject: [PATCH] code_refactoring --- .../comparator/base_comparator.py | 24 ++ .../comparator/communication_comparator.py | 20 + .../comparator/index_comparator.py | 44 -- .../comparator/operator_comparator.py | 13 + .../operator_statistic_generator.py | 28 ++ .../overall_performance_comparator.py | 59 +++ .../{generation => compare_bean}/__init__.py | 0 .../compare_bean/communication_bean.py | 72 ++++ .../compare_bean/memory_compare_bean.py | 51 +++ .../compare_bean/memory_statistic_bean.py | 38 ++ .../compare_bean/operator_compare_bean.py | 51 +++ .../compare_bean/operator_statistic_bean.py | 36 ++ .../compare_bean/origin_data_bean/__init__.py | 0 .../origin_data_bean}/compare_event.py | 0 .../origin_data_bean/trace_event_bean.py | 134 ++++++ .../profiling_info.py} | 9 +- .../generation/base_generator.py | 11 - .../communication_compare_generator.py | 50 --- .../generation/comparison_generator.py | 33 -- .../generation/memory_compare_generator.py | 37 -- .../generation/memory_statistic_generator.py | 53 --- .../generation/operator_compare_generator.py | 39 -- .../operator_statistic_generator.py | 43 -- profiler/compare_tools/generator/__init__.py | 0 .../compare_tools/generator/base_generator.py | 22 + .../generator/comparison_generator.py | 36 ++ .../detail_performance_generator.py} | 83 ++-- .../overall_performance_generator.py | 19 + profiler/compare_tools/performance_compare.py | 26 +- .../profiling_analysis/__init__.py | 14 - .../profiling_analysis/gpu_parser.py | 213 ---------- .../profiling_analysis/npu_parser.py | 293 -------------- .../profiling_analysis/profiling_parse.py | 131 ------ .../profiling_parser/__init__.py | 0 .../profiling_parser/base_profiling_parser.py | 116 ++++++ .../profiling_parser/gpu_profiling_parser.py | 215 ++++++++++ .../profiling_parser/npu_profiling_parser.py | 382 
++++++++++++++++++ profiler/compare_tools/utils/args_manager.py | 70 ++-- profiler/compare_tools/utils/common_func.py | 6 + profiler/compare_tools/utils/constant.py | 30 +- profiler/compare_tools/utils/excel_config.py | 152 ++++--- .../compare_tools/utils/profiling_parser.py | 300 -------------- profiler/compare_tools/utils/torch_op_node.py | 2 +- .../compare_tools/utils/trace_event_data.py | 42 -- profiler/compare_tools/view/base_view.py | 10 + profiler/compare_tools/view/excel_view.py | 10 +- profiler/compare_tools/view/screen_view.py | 19 + .../compare_tools/view/work_sheet_creator.py | 63 ++- 48 files changed, 1591 insertions(+), 1508 deletions(-) create mode 100644 profiler/compare_tools/comparator/base_comparator.py create mode 100644 profiler/compare_tools/comparator/communication_comparator.py delete mode 100644 profiler/compare_tools/comparator/index_comparator.py create mode 100644 profiler/compare_tools/comparator/operator_comparator.py create mode 100644 profiler/compare_tools/comparator/operator_statistic_generator.py create mode 100644 profiler/compare_tools/comparator/overall_performance_comparator.py rename profiler/compare_tools/{generation => compare_bean}/__init__.py (100%) create mode 100644 profiler/compare_tools/compare_bean/communication_bean.py create mode 100644 profiler/compare_tools/compare_bean/memory_compare_bean.py create mode 100644 profiler/compare_tools/compare_bean/memory_statistic_bean.py create mode 100644 profiler/compare_tools/compare_bean/operator_compare_bean.py create mode 100644 profiler/compare_tools/compare_bean/operator_statistic_bean.py create mode 100644 profiler/compare_tools/compare_bean/origin_data_bean/__init__.py rename profiler/compare_tools/{utils => compare_bean/origin_data_bean}/compare_event.py (100%) create mode 100644 profiler/compare_tools/compare_bean/origin_data_bean/trace_event_bean.py rename profiler/compare_tools/{profiling_analysis/parser_helper.py => compare_bean/profiling_info.py} (91%) delete mode 
100644 profiler/compare_tools/generation/base_generator.py delete mode 100644 profiler/compare_tools/generation/communication_compare_generator.py delete mode 100644 profiler/compare_tools/generation/comparison_generator.py delete mode 100644 profiler/compare_tools/generation/memory_compare_generator.py delete mode 100644 profiler/compare_tools/generation/memory_statistic_generator.py delete mode 100644 profiler/compare_tools/generation/operator_compare_generator.py delete mode 100644 profiler/compare_tools/generation/operator_statistic_generator.py create mode 100644 profiler/compare_tools/generator/__init__.py create mode 100644 profiler/compare_tools/generator/base_generator.py create mode 100644 profiler/compare_tools/generator/comparison_generator.py rename profiler/compare_tools/{comparator/op_comparator.py => generator/detail_performance_generator.py} (59%) create mode 100644 profiler/compare_tools/generator/overall_performance_generator.py delete mode 100644 profiler/compare_tools/profiling_analysis/__init__.py delete mode 100644 profiler/compare_tools/profiling_analysis/gpu_parser.py delete mode 100644 profiler/compare_tools/profiling_analysis/npu_parser.py delete mode 100644 profiler/compare_tools/profiling_analysis/profiling_parse.py create mode 100644 profiler/compare_tools/profiling_parser/__init__.py create mode 100644 profiler/compare_tools/profiling_parser/base_profiling_parser.py create mode 100644 profiler/compare_tools/profiling_parser/gpu_profiling_parser.py create mode 100644 profiler/compare_tools/profiling_parser/npu_profiling_parser.py delete mode 100644 profiler/compare_tools/utils/profiling_parser.py delete mode 100644 profiler/compare_tools/utils/trace_event_data.py create mode 100644 profiler/compare_tools/view/base_view.py create mode 100644 profiler/compare_tools/view/screen_view.py diff --git a/profiler/compare_tools/comparator/base_comparator.py b/profiler/compare_tools/comparator/base_comparator.py new file mode 100644 index 
# File: profiler/compare_tools/comparator/base_comparator.py
from abc import ABC, abstractmethod
from typing import Any


class BaseComparator(ABC):
    """Base class for comparators that build the data of one sheet (table).

    ``bean`` must expose the class attributes ``TABLE_NAME``, ``HEADERS`` and
    ``OVERHEAD``; subclasses fill ``self._rows`` inside :meth:`_compare`.
    """

    def __init__(self, origin_data: Any, bean: Any):
        self._sheet_name = bean.TABLE_NAME
        self._headers = bean.HEADERS
        self._overhead = bean.OVERHEAD
        self._origin_data = origin_data
        self._bean = bean
        self._rows = []

    def generate_data(self) -> dict:
        """Run the comparison and return the data of one sheet.

        Returns:
            A single-entry dict keyed by the sheet name; its value holds the
            sheet's ``"headers"``, ``"rows"`` and ``"overhead"``.
        """
        self._compare()
        return {self._sheet_name: {"headers": self._headers, "rows": self._rows, "overhead": self._overhead}}

    @abstractmethod
    def _compare(self):
        """Populate ``self._rows``; every concrete comparator must override this."""
        # The message used to name generate_data, which is not the abstract method.
        raise NotImplementedError("Function _compare needs to be implemented.")


# File: profiler/compare_tools/comparator/communication_comparator.py
from comparator.base_comparator import BaseComparator
from compare_bean.communication_bean import CommunicationBean
from utils.constant import Constant
from utils.common_func import update_order_id


class CommunicationComparator(BaseComparator):
    """Build the communication sheet rows from base and comparison communication data."""

    def __init__(self, origin_data: dict, bean: Any):
        super().__init__(origin_data, bean)

    def _compare(self):
        """Emit rows for every communication op found in either profiling.

        Ops present in the base data are paired with the same-named comparison
        op (or an empty dict); ops that only exist in the comparison data are
        appended afterwards with an empty base side.
        """
        base_data = self._origin_data.get(Constant.BASE_DATA, {}).get(Constant.COMMUNICATION_DICT, {})
        # Shallow copy: matched ops are popped below, and popping from the
        # caller's dict would drain origin_data (and break iteration if base
        # and comparison happen to be the same dict object).
        comparison_data = dict(
            self._origin_data.get(Constant.COMPARISON_DATA, {}).get(Constant.COMMUNICATION_DICT, {}))
        for comm_name, comm_data in base_data.items():
            comparison_comm_data = comparison_data.pop(comm_name, {})
            self._rows.extend(CommunicationBean(comm_name, comm_data, comparison_comm_data).rows)
        for comm_name, comm_data in comparison_data.items():
            self._rows.extend(CommunicationBean(comm_name, {}, comm_data).rows)
        update_order_id(self._rows)
a/profiler/compare_tools/comparator/index_comparator.py b/profiler/compare_tools/comparator/index_comparator.py deleted file mode 100644 index 91b050548d..0000000000 --- a/profiler/compare_tools/comparator/index_comparator.py +++ /dev/null @@ -1,44 +0,0 @@ -from utils.args_manager import ArgsManager - - -class IndexComparator: - def __init__(self, args: any): - self._args = args - self._args_manager = ArgsManager() - self._base_profiling = self._args_manager.base_profiling - self._comparison_profiling = self._args_manager.comparison_profiling - - def compare(self) -> list: - base_data_dict, comparison_data_dict = {}, {} - if not self._base_profiling.communication_data: - print(f"[WARNING] Can't find any communication op in the file: {self._base_profiling.json_path}") - for data in self._base_profiling.communication_data: - name_list = data.get("name", "").split("_") - if len(name_list) >= 2: - base_data_dict.setdefault(name_list[1].lower(), []).append(float(data.get("dur", 0))) - if self._args.base_profiling_path != self._args.comparison_profiling_path: - if not self._comparison_profiling.communication_data: - print(f"[WARNING] Can't find any communication op in the file: {self._comparison_profiling.json_path}") - for data in self._comparison_profiling.communication_data: - name_list = data.get("name", "").split("_") - if len(name_list) >= 2: - comparison_data_dict.setdefault(name_list[1].lower(), []).append(float(data.get("dur", 0))) - result_data = [] - for name, base_dur_list in base_data_dict.items(): - base_row = [name, None, len(base_dur_list), sum(base_dur_list), sum(base_dur_list) / len(base_dur_list), - max(base_dur_list), min(base_dur_list)] - if self._args.base_profiling_path == self._args.comparison_profiling_path: - result_data.append(base_row + [None] * 7) - continue - com_dur_list = comparison_data_dict.pop(name, None) - if not com_dur_list: - com_row = [None, None, None, 0, None, None, None] - else: - com_row = [name, None, len(com_dur_list), 
# File: profiler/compare_tools/comparator/operator_comparator.py
from typing import Any

from comparator.base_comparator import BaseComparator


class OperatorComparator(BaseComparator):
    """Build one row per (base_op, comparison_op) pair using the configured bean."""

    def __init__(self, origin_data: Any, bean: Any):
        super().__init__(origin_data, bean)

    def _compare(self):
        """Turn every matched operator pair into a row, keeping the input order."""
        if not self._origin_data:
            return
        self._rows = [self._bean(index, base_op, comparison_op).row
                      for index, (base_op, comparison_op) in enumerate(self._origin_data)]


# File: profiler/compare_tools/comparator/operator_statistic_generator.py
from comparator.base_comparator import BaseComparator
from utils.common_func import update_order_id


class OperatorStatisticComparator(BaseComparator):
    """Aggregate matched operator pairs by operator name and build one row per name."""

    def __init__(self, origin_data: list, bean: Any):
        super().__init__(origin_data, bean)

    def _compare(self):
        """Build the per-name statistic rows, sorted descending by the second-to-last column."""
        if not self._origin_data:
            return
        base_op_dict, comparison_op_dict = self._group_by_op_name()
        for op_name, base_data in base_op_dict.items():
            # pop() so that only names missing from the base side remain afterwards
            comparison_data = comparison_op_dict.pop(op_name, [])
            self._rows.append(self._bean(op_name, base_data, comparison_data).row)
        for op_name, comparison_data in comparison_op_dict.items():
            self._rows.append(self._bean(op_name, [], comparison_data).row)
        self._rows.sort(key=lambda row: row[-2], reverse=True)
        update_order_id(self._rows)

    def _group_by_op_name(self):
        """Return ``(base_ops_by_name, comparison_ops_by_name)`` built from the matched pairs."""
        base_op_dict, comparison_op_dict = {}, {}
        for base_op, comparison_op in self._origin_data:
            if base_op:
                base_op_dict.setdefault(base_op.name, []).append(base_op)
            if comparison_op:
                comparison_op_dict.setdefault(comparison_op.name, []).append(comparison_op)
        return base_op_dict, comparison_op_dict


# File: profiler/compare_tools/comparator/overall_performance_comparator.py
from comparator.base_comparator import BaseComparator
from utils.constant import Constant


class OverallPerformanceComparator(BaseComparator):
    """Build the overall-performance table: one row per profiling, one column per metric."""

    def __init__(self, origin_data: dict, bean: Any):
        super().__init__(origin_data, bean)

    def _compare(self):
        """Fill ``self._headers`` and the two rows (base first, comparison second).

        Optional columns are only emitted when at least one of the two
        profilings has a truthy value for that metric.
        """
        base_info = self._origin_data.get(Constant.BASE_DATA)
        comp_info = self._origin_data.get(Constant.COMPARISON_DATA)
        infos = (base_info, comp_info)
        self._headers = ['']
        columns = [[f'{info.profiling_type}'] for info in infos]

        def add_column(header, render):
            # Append the header once and the rendered cell to both rows.
            self._headers.append(header)
            for column, info in zip(columns, infos):
                column.append(render(info))

        def either(attr):
            # True when the base or the comparison profiling has a truthy value.
            return getattr(base_info, attr) or getattr(comp_info, attr)

        if not base_info.hide_op_details and not comp_info.hide_op_details:
            add_column('Cube Time(Num)', lambda info: f'{info.cube_time:.3f}s({info.cube_num})')
            add_column('Vector Time(Num)', lambda info: f'{info.vec_time:.3f}s({info.vec_num})')
        if either('other_time'):
            add_column('Other Time', lambda info: f'{info.other_time:.3f}s')
        if either('flash_attention_time_fwd'):
            add_column('Flash Attention Time(Forward)(Num)',
                       lambda info: f'{info.flash_attention_time_fwd:.3f}s({info.fa_num_fwd})')
        if either('flash_attention_time_bwd'):
            add_column('Flash Attention Time(Backward)(Num)',
                       lambda info: f'{info.flash_attention_time_bwd:.3f}s({info.fa_num_bwd})')
        add_column('Computing Time', lambda info: f'{info.compute_time:.3f}s')
        if either('memory_used'):
            add_column('Mem Usage', lambda info: f'{info.memory_used:.2f}G')
        # Was ': .3f' (space sign flag), which prefixed positive values with a
        # blank and was inconsistent with every other column.
        add_column('Uncovered Communication Time',
                   lambda info: f'{info.communication_not_overlapped:.3f}s')
        if either('sdma_time'):
            add_column('SDMA Time(Num)', lambda info: f'{info.sdma_time:.3f}s({info.sdma_num})')

        not_minimal = any(info.profiling_type == Constant.NPU and not info.minimal_profiling for info in infos)
        cue = '(Not minimal profiling)' if not_minimal else ''
        add_column('Free Time', lambda info: f'{info.scheduling_time:.3f}s')
        add_column('E2E Time' + cue, lambda info: f'{info.e2e_time:.3f}s')
        self._rows = columns


# File: profiler/compare_tools/compare_bean/communication_bean.py
from itertools import zip_longest

from utils.constant import Constant
from utils.excel_config import ExcelConfig
from utils.common_func import calculate_diff_ratio


class CommunicationInfo:
    """Call count and duration statistics for one communication op or one of its tasks.

    All fields stay ``None`` when ``data_list`` is empty.
    """

    def __init__(self, name: str, data_list: list, is_task: bool):
        self.comm_op_name = None
        self.task_name = None
        self.calls = None
        self.total_duration = None
        self.avg_duration = None
        self.max_duration = None
        self.min_duration = None
        if data_list:
            total = sum(data_list)
            # Task rows show "|" in the op-name column and their own name in the task column.
            self.comm_op_name = "|" if is_task else name
            self.task_name = name if is_task else None
            self.calls = len(data_list)
            self.total_duration = total
            self.avg_duration = total / len(data_list)
            self.max_duration = max(data_list)
            self.min_duration = min(data_list)


class CommunicationBean:
    """Rows of the communication table for one op: a summary row followed by its task rows."""

    TABLE_NAME = Constant.COMMUNICATION_TABLE
    HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME)
    OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME)

    def __init__(self, name: str, base_comm_data: dict, comparison_comm_data: dict):
        self._name = name
        self._base_comm = base_comm_data
        self._comparison_comm = comparison_comm_data

    @property
    def rows(self):
        """Return the op summary row followed by one row per task pair.

        Tasks are paired positionally: the i-th base task sits next to the
        i-th comparison task, and the shorter side is padded with empty
        entries. The input dicts are not modified, so the property returns the
        same rows on every access.
        """
        rows = []
        base_comm = CommunicationInfo(self._name, self._base_comm.get("comm_list", []), is_task=False)
        comparison_comm = CommunicationInfo(self._name, self._comparison_comm.get("comm_list", []), is_task=False)
        rows.append(self._get_row(base_comm, comparison_comm, is_task=False))

        base_task = self._base_comm.get("comm_task", {})
        comparison_task = self._comparison_comm.get("comm_task", {})
        # zip_longest reproduces the previous "take the first remaining
        # comparison task" pairing without popping from the caller's dict.
        paired_tasks = zip_longest(base_task.items(), comparison_task.items(), fillvalue=("", []))
        for (base_name, base_list), (comparison_name, comparison_list) in paired_tasks:
            base_task_info = CommunicationInfo(base_name, base_list, is_task=True)
            comparison_task_info = CommunicationInfo(comparison_name, comparison_list, is_task=True)
            rows.append(self._get_row(base_task_info, comparison_task_info, is_task=True))
        return rows

    @classmethod
    def _get_row(cls, base_info: CommunicationInfo, comparison_info: CommunicationInfo, is_task: bool) -> list:
        """Build one table row; the leading ``None`` is the order-id cell, left unfilled here."""
        row = [None, base_info.comm_op_name, base_info.task_name, base_info.calls, base_info.total_duration,
               base_info.avg_duration, base_info.max_duration, base_info.min_duration, comparison_info.comm_op_name,
               comparison_info.task_name, comparison_info.calls, comparison_info.total_duration,
               comparison_info.avg_duration, comparison_info.max_duration, comparison_info.min_duration]
        # Diff columns are only computed for the op summary row, not for task rows.
        diff_fields = [None, None] if is_task else calculate_diff_ratio(base_info.total_duration,
                                                                        comparison_info.total_duration)
        row.extend(diff_fields)
        return row
# File: profiler/compare_tools/compare_bean/memory_compare_bean.py
from utils.common_func import calculate_diff_ratio
from utils.constant import Constant
from utils.excel_config import ExcelConfig
from utils.torch_op_node import TorchOpNode
from utils.tree_builder import TreeBuilder


class MemoryCompareBean:
    """One row of the memory table for a matched (base, comparison) operator pair."""

    TABLE_NAME = Constant.MEMORY_TABLE
    HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME)
    OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME)

    def __init__(self, index: int, base_op: TorchOpNode, comparison_op: TorchOpNode):
        self._index = index
        self._base_op = MemoryInfo(base_op)
        self._comparison_op = MemoryInfo(comparison_op)

    @property
    def row(self):
        """Return ``[order, <base fields>, <comparison fields>, <diff fields>]``."""
        base, comparison = self._base_op, self._comparison_op
        cells = [self._index + 1]  # displayed order is 1-based
        for info in (base, comparison):
            cells += [info.operator_name, info.input_shape, info.input_type, info.memory_details, info.size]
        cells.extend(calculate_diff_ratio(base.size, comparison.size))
        return cells


class MemoryInfo:
    """Memory figures of one torch operator; fields keep their defaults when the op is falsy."""

    def __init__(self, torch_op: TorchOpNode):
        self.operator_name = None
        self.input_shape = None
        self.input_type = None
        self.size = 0
        self.memory_details = None
        self._memory_list = []
        if not torch_op:
            return
        self.operator_name = torch_op.name
        self.input_shape = torch_op.input_shape
        self.input_type = torch_op.input_type
        self._memory_list = TreeBuilder.get_total_memory(torch_op)
        if self._memory_list:
            self.size, self.memory_details = self._get_memory_fields()

    def _get_memory_fields(self):
        """Return the summed size and the concatenated detail text of all memory records."""
        total_size = 0
        detail_parts = []
        for record in self._memory_list:
            total_size += record.size
            detail_parts.append(record.memory_details)
        return total_size, "".join(detail_parts)
# File: profiler/compare_tools/compare_bean/memory_statistic_bean.py
from utils.common_func import calculate_diff_ratio
from utils.constant import Constant
from utils.tree_builder import TreeBuilder
from utils.excel_config import ExcelConfig


class MemoryStatisticBean:
    """One row of the memory statistic table for all operators sharing a name."""

    TABLE_NAME = Constant.MEMORY_TOP_TABLE
    HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME)
    OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME)

    def __init__(self, name: str, base_data: list, comparison_data: list):
        self._name = name
        self._base_info = MemoryStatisticInfo(base_data)
        self._comparison_info = MemoryStatisticInfo(comparison_data)

    @property
    def row(self):
        """Return ``[None, name, <base stats>, <comparison stats>, <diff fields>]``."""
        base, comparison = self._base_info, self._comparison_info
        cells = [None, self._name]  # leading None is the order-id cell, left unfilled here
        for info in (base, comparison):
            cells += [info.duration_ms, info.size_mb, info.number]
        cells.extend(calculate_diff_ratio(base.size_mb, comparison.size_mb))
        return cells


class MemoryStatisticInfo:
    """Accumulated duration, size and operator count for a list of operators."""

    def __init__(self, data_list: list):
        self._data_list = data_list
        self.duration_ms = 0
        self.size_mb = 0
        self.number = len(data_list)
        self._get_info()

    def _get_info(self):
        """Add each operator's memory duration and size, scaled by the Constant factors."""
        for op_node in self._data_list:
            records = TreeBuilder.get_total_memory(op_node)
            self.duration_ms += sum(record.duration / Constant.US_TO_MS for record in records)
            self.size_mb += sum(record.size / Constant.KB_TO_MB for record in records)
# File: profiler/compare_tools/compare_bean/operator_compare_bean.py
from utils.common_func import calculate_diff_ratio
from utils.constant import Constant
from utils.excel_config import ExcelConfig
from utils.torch_op_node import TorchOpNode
from utils.tree_builder import TreeBuilder


class OperatorCompareBean:
    """One row of the operator table for a matched (base, comparison) operator pair."""

    TABLE_NAME = Constant.OPERATOR_TABLE
    HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME)
    OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME)

    def __init__(self, index: int, base_op: TorchOpNode, comparison_op: TorchOpNode):
        self._index = index
        self._base_op = OperatorInfo(base_op)
        self._comparison_op = OperatorInfo(comparison_op)

    @property
    def row(self):
        """Return ``[order, <base fields>, <comparison fields>, <diff fields>]``."""
        base, comparison = self._base_op, self._comparison_op
        cells = [self._index + 1]  # displayed order is 1-based
        for info in (base, comparison):
            cells += [info.operator_name, info.input_shape, info.input_type, info.kernel_details, info.device_dur]
        cells.extend(calculate_diff_ratio(base.device_dur, comparison.device_dur))
        return cells


class OperatorInfo:
    """Kernel figures of one torch operator; fields keep their defaults when the op is falsy."""

    def __init__(self, torch_op: TorchOpNode):
        self.operator_name = None
        self.input_shape = None
        self.input_type = None
        self.device_dur = 0
        self.kernel_details = None
        self._kernel_list = []
        if not torch_op:
            return
        self.operator_name = torch_op.name
        self.input_shape = torch_op.input_shape
        self.input_type = torch_op.input_type
        self._kernel_list = TreeBuilder.get_total_kernels(torch_op)
        if self._kernel_list:
            self.device_dur, self.kernel_details = self._get_kernel_fields()

    def _get_kernel_fields(self):
        """Return the summed device duration and the concatenated detail text of all kernels."""
        total_dur = 0
        detail_parts = []
        for kernel in self._kernel_list:
            total_dur += kernel.device_dur
            detail_parts.append(kernel.kernel_details)
        return total_dur, "".join(detail_parts)
# File: profiler/compare_tools/compare_bean/operator_statistic_bean.py
from utils.common_func import calculate_diff_ratio
from utils.constant import Constant
from utils.excel_config import ExcelConfig
from utils.tree_builder import TreeBuilder


class OperatorStatisticBean:
    """One row of the operator statistic table for all operators sharing a name."""

    TABLE_NAME = Constant.OPERATOR_TOP_TABLE
    HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME)
    OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME)

    def __init__(self, name: str, base_data: list, comparison_data: list):
        self._name = name
        self._base_info = OperatorStatisticInfo(base_data)
        self._comparison_info = OperatorStatisticInfo(comparison_data)

    @property
    def row(self):
        """Return ``[None, name, <base stats>, <comparison stats>, <diff fields>]``."""
        base, comparison = self._base_info, self._comparison_info
        cells = [None, self._name]  # leading None is the order-id cell, left unfilled here
        for info in (base, comparison):
            cells += [info.device_dur_ms, info.number]
        cells.extend(calculate_diff_ratio(base.device_dur_ms, comparison.device_dur_ms))
        return cells


class OperatorStatisticInfo:
    """Accumulated device duration and operator count for a list of operators."""

    def __init__(self, data_list: list):
        self._data_list = data_list
        self.device_dur_ms = 0
        self.number = len(data_list)
        self._get_info()

    def _get_info(self):
        """Add each operator's kernel durations, scaled by ``Constant.US_TO_MS``."""
        for op_node in self._data_list:
            kernels = TreeBuilder.get_total_kernels(op_node)
            self.device_dur_ms += sum(kernel.device_dur / Constant.US_TO_MS for kernel in kernels)
# File: profiler/compare_tools/compare_bean/origin_data_bean/trace_event_bean.py
from typing import Optional


class TraceEventBean:
    """Read-only accessor around one raw trace event.

    The event is expected to be a dict with the keys ``pid``, ``tid``, ``ts``,
    ``dur``, ``ph``, ``cat``, ``name`` and ``args`` (the layout looks like the
    Chrome Trace Event Format; confirm against the producer). Missing keys
    fall back to neutral defaults. A non-dict event keeps every default, so
    callers can filter it out with :meth:`is_dict` instead of crashing in the
    constructor.
    """

    def __init__(self, event: dict):
        self._event = event
        self._pid = 0
        self._tid = 0
        self._ts = 0
        self._dur = 0
        self._ph = ""
        self._cat = ""
        self._name = ""
        self._args = {}
        self.init()

    @property
    def pid(self) -> int:
        return self._pid

    @property
    def tid(self) -> int:
        return self._tid

    @property
    def dur(self) -> float:
        return self._dur

    @property
    def start_time(self) -> float:
        return self._ts

    @property
    def end_time(self) -> float:
        return self._ts + self._dur

    @property
    def name(self) -> str:
        return self._name

    @property
    def lower_name(self) -> str:
        return self._name.lower()

    @property
    def lower_cat(self) -> str:
        return self._cat.lower()

    @property
    def args(self) -> dict:
        return self._args

    @property
    def id(self) -> Optional[str]:
        # Guarded because a non-dict event has no .get().
        return self._event.get("id") if self.is_dict() else None

    @property
    def stream_id(self) -> Optional[int]:
        return self._args.get('Stream Id')

    @property
    def stream(self) -> Optional[int]:
        return self._args.get("stream")

    @property
    def task_type(self) -> Optional[int]:
        return self._args.get('Task Type')

    @property
    def device_id(self) -> int:
        return self._args.get('Device Id', -1)

    @property
    def total_reserved(self):
        return self._args.get('Total Reserved', 0)

    @property
    def corr_id(self) -> Optional[int]:
        return self._args.get('correlation_id')

    @property
    def process_name(self) -> str:
        return self._args.get("name", "")

    @property
    def event(self) -> dict:
        return self._event

    def is_m_mode(self) -> bool:
        return self._ph == "M"

    def is_x_mode(self) -> bool:
        return self._ph == "X"

    def is_flow_start(self) -> bool:
        return self._ph == "s"

    def is_flow_end(self) -> bool:
        return self._ph == "f"

    def is_process_meta(self) -> bool:
        return self.is_m_mode() and self._name == "process_name"

    def is_thread_meta(self) -> bool:
        return self.is_m_mode() and self._name == "thread_name"

    def is_communication_op_thread(self) -> bool:
        return "Communication" in self._args.get("name", "")

    def is_hccl_process_name(self) -> bool:
        return self.process_name == "HCCL"

    def is_npu_process_name(self) -> bool:
        return self.process_name == "Ascend Hardware"

    def is_computing_event(self) -> bool:
        return self._name == "Computing"

    def is_comm_not_overlap(self) -> bool:
        return self._name == 'Communication(Not Overlapped)'

    def is_valid_event(self) -> bool:
        """Return True only when name, category, duration and timestamp are all truthy."""
        # bool() so callers get True/False rather than whichever operand ended the chain.
        return bool(self._name and self._cat and self._dur and self._ts)

    def is_dict(self) -> bool:
        return isinstance(self._event, dict)

    def init(self):
        """Copy the known keys out of the raw event; a non-dict event keeps the defaults."""
        if not self.is_dict():
            return
        event = self._event
        self._pid = event.get("pid", 0)
        self._tid = event.get("tid", 0)
        self._ts = float(event.get("ts", 0))
        self._dur = float(event.get("dur", 0))
        self._ph = event.get("ph", "")
        self._cat = event.get("cat", "")
        self._name = event.get("name", "")
        self._args = event.get("args", {})
- -import json -import os +from utils.constant import Constant class ProfilingInfo: + TABLE_NAME = Constant.PERFORMANCE_TABLE + HEADERS = [] + OVERHEAD = [] + def __init__(self, profiling_type: str): self.profiling_type = profiling_type self.cube_time = 0.0 @@ -30,7 +32,6 @@ class ProfilingInfo: self.fa_num_bwd = 0 self.compute_time = 0.0 self.communication_not_overlapped = 0.0 - self.scheduling_ratio = 0.0 self.memory_used = 0.0 self.e2e_time = 0.0 self.sdma_time = 0.0 diff --git a/profiler/compare_tools/generation/base_generator.py b/profiler/compare_tools/generation/base_generator.py deleted file mode 100644 index e65bf33738..0000000000 --- a/profiler/compare_tools/generation/base_generator.py +++ /dev/null @@ -1,11 +0,0 @@ -from abc import ABC, abstractmethod - - -class BaseGenerator(ABC): - def __init__(self, sheet_name: str, data: any): - self.sheet_name = sheet_name - self.data = data - - @abstractmethod - def generate_data(self): - raise NotImplementedError("Function generate_data need to be implemented.") diff --git a/profiler/compare_tools/generation/communication_compare_generator.py b/profiler/compare_tools/generation/communication_compare_generator.py deleted file mode 100644 index 243d6a5150..0000000000 --- a/profiler/compare_tools/generation/communication_compare_generator.py +++ /dev/null @@ -1,50 +0,0 @@ -import math - -import pandas as pd - -from generation.base_generator import BaseGenerator -from utils.args_manager import ArgsManager -from utils.common_func import calculate_diff_ratio -from utils.constant import Constant - - -class CommunicationCompareGenerator(BaseGenerator): - def __init__(self, data: list): - super().__init__(Constant.COMMUNICATION_SHEET, data) - self._base_task_data = ArgsManager().base_profiling.communication_task_data - self._comparison_task_data = ArgsManager().comparison_profiling.communication_task_data - - def generate_data(self): - result_data = [] - row_headers = ["base_op", "base_task", "base_calls", 
"base_total_dur", "base_avg_dur", "base_max_dur", - "base_min_dur", "com_op", "com_task", "com_calls", "com_total_dur", "com_avg_dur", "com_max_dur", - "com_min_dur"] - for row in self.data: - if ArgsManager().base_profiling_path == ArgsManager().comparison_profiling_path: - result_data.append(row + [None, None]) - else: - result_data.append(row + calculate_diff_ratio(row[row_headers.index("base_total_dur")], - row[row_headers.index("com_total_dur")])) - base_data = self._get_task_statistic(row[row_headers.index("base_op")], is_base=True) - comparison_data = self._get_task_statistic(row[row_headers.index("com_op")], is_base=False) - for index in range(max(len(base_data), len(comparison_data))): - if index >= len(base_data): - base_row = ["|"] + [None] * 6 - else: - base_row = ["|"] + base_data[index] - if index >= len(comparison_data): - comparison_row = ["|"] + [None] * 6 - else: - comparison_row = ["|"] + comparison_data[index] - result_data.append(base_row + comparison_row + [None, None]) - return result_data - - def _get_task_statistic(self, name: str, is_base: bool): - if not name: - return [] - task_list = self._base_task_data.get(name) if is_base else self._comparison_task_data.get(name) - if task_list: - data = [[data.get("name", ""), float(data.get("dur", 0))] for data in task_list] - df = pd.DataFrame(data, columns=[Constant.OP_KEY, Constant.DEVICE_DUR]) - return df.groupby(Constant.OP_KEY).agg(["count", "sum", "mean", "max", "min"]).reset_index().values.tolist() - return [] diff --git a/profiler/compare_tools/generation/comparison_generator.py b/profiler/compare_tools/generation/comparison_generator.py deleted file mode 100644 index 44798cb954..0000000000 --- a/profiler/compare_tools/generation/comparison_generator.py +++ /dev/null @@ -1,33 +0,0 @@ -from comparator.index_comparator import IndexComparator -from comparator.op_comparator import OpComparator -from generation.communication_compare_generator import CommunicationCompareGenerator -from 
generation.memory_compare_generator import MemoryCompareGenerator -from generation.memory_statistic_generator import MemoryStatisticGenerator -from generation.operator_compare_generator import OperatorCompareGenerator -from generation.operator_statistic_generator import OperatorStatisticGenerator -from view.excel_view import ExcelViewer -from utils.constant import Constant -from utils.args_manager import ArgsManager -from utils.torch_op_node import TorchOpNode -from utils.tree_builder import TreeBuilder - - -class ComparisonGenerator: - def __init__(self, args: any): - self._args = args - self._args_manager = ArgsManager() - - def run(self, file_path: str): - data_dict = {} - if self._args.enable_operator_compare or self._args.enable_memory_compare: - op_compare_result = OpComparator(self._args).compare() - if self._args.enable_communication_compare: - index_compare_result = IndexComparator(self._args).compare() - data_dict[Constant.COMMUNICATION_SHEET] = CommunicationCompareGenerator(index_compare_result).generate_data() - if self._args.enable_operator_compare: - data_dict[Constant.OPERATOR_SHEET] = OperatorCompareGenerator(op_compare_result).generate_data() - data_dict[Constant.OPERATOR_TOP_SHEET] = OperatorStatisticGenerator(op_compare_result).generate_data() - if self._args.enable_memory_compare: - data_dict[Constant.MEMORY_SHEET] = MemoryCompareGenerator(op_compare_result).generate_data() - data_dict[Constant.MEMORY_TOP_SHEET] = MemoryStatisticGenerator(op_compare_result).generate_data() - ExcelViewer(data_dict, file_path).generate_view() diff --git a/profiler/compare_tools/generation/memory_compare_generator.py b/profiler/compare_tools/generation/memory_compare_generator.py deleted file mode 100644 index 2cf919d1f6..0000000000 --- a/profiler/compare_tools/generation/memory_compare_generator.py +++ /dev/null @@ -1,37 +0,0 @@ -from generation.base_generator import BaseGenerator -from utils.args_manager import ArgsManager -from utils.common_func import 
calculate_diff_ratio -from utils.constant import Constant -from utils.torch_op_node import TorchOpNode -from utils.tree_builder import TreeBuilder - - -class MemoryCompareGenerator(BaseGenerator): - def __init__(self, data: list): - super().__init__(Constant.MEMORY_SHEET, data) - - def generate_data(self): - def get_row_info(torch_op_node: TorchOpNode): - if not torch_op_node: - return [None] * 4 + [0] - memory_list = TreeBuilder.get_total_memory(torch_op_node) - size = 0 - memory_details = "" - for memory in memory_list: - size += memory.size - memory_details += memory.memory_details - return [torch_op_node.name, torch_op_node.input_shape, torch_op_node.input_type, memory_details, size] - - if not self.data: - return [] - data = [None] * (len(self.data)) - for index, (base_op, comparison_op) in enumerate(self.data): - base_row = get_row_info(base_op) - if ArgsManager().base_profiling_path == ArgsManager().comparison_profiling_path: - comparison_row = [None] * 5 - diff_ratio = [None] * 2 - else: - comparison_row = get_row_info(comparison_op) - diff_ratio = calculate_diff_ratio(base_row[-1], comparison_row[-1]) - data[index] = base_row + comparison_row + diff_ratio - return data diff --git a/profiler/compare_tools/generation/memory_statistic_generator.py b/profiler/compare_tools/generation/memory_statistic_generator.py deleted file mode 100644 index 652e73a38d..0000000000 --- a/profiler/compare_tools/generation/memory_statistic_generator.py +++ /dev/null @@ -1,53 +0,0 @@ -from generation.base_generator import BaseGenerator -from utils.args_manager import ArgsManager -from utils.common_func import calculate_diff_ratio -from utils.constant import Constant -from utils.tree_builder import TreeBuilder - - -class MemoryStatisticGenerator(BaseGenerator): - def __init__(self, data: list): - super().__init__(Constant.MEMORY_TOP_SHEET, data) - - def generate_data(self): - base_op_dict, comparison_op_dict = {}, {} - for base_op, comparison_op in self.data: - if base_op: - 
memory_list = TreeBuilder.get_total_memory(base_op) - size = sum([memory.size / Constant.KB_TO_MB for memory in memory_list]) - duration = sum([memory.duration / Constant.US_TO_MS for memory in memory_list]) - base_op_dict.setdefault(base_op.name, {}).setdefault("size", []).append(size) - base_op_dict.setdefault(base_op.name, {}).setdefault("duration", []).append(duration) - if comparison_op: - memory_list = TreeBuilder.get_total_memory(comparison_op) - size = sum([memory.size / Constant.KB_TO_MB for memory in memory_list]) - duration = sum([memory.duration / Constant.US_TO_MS for memory in memory_list]) - comparison_op_dict.setdefault(comparison_op.name, {}).setdefault("size", []).append(size) - comparison_op_dict.setdefault(comparison_op.name, {}).setdefault("duration", []).append(duration) - result_data = [] - for op_name, base_data in base_op_dict.items(): - base_dur = sum(base_data.get("duration", [])) - base_size = sum(base_data.get("size", [])) - base_num = len(base_data.get("size", [])) - comparison_data = comparison_op_dict.pop(op_name, None) - if ArgsManager().base_profiling_path == ArgsManager().comparison_profiling_path: - result_data.append([op_name, base_dur, base_size, base_num] + [None] * 5) - elif comparison_data: - comparison_dur = sum(comparison_data.get("duration", [])) - comparison_size = sum(comparison_data.get("size", [])) - comparison_num = len(comparison_data.get("size", [])) - result_data.append( - [op_name, base_dur, base_size, base_num, comparison_dur, comparison_size, - comparison_num] + calculate_diff_ratio(base_size, comparison_size)) - else: - result_data.append( - [op_name, base_dur, base_size, base_num, 0, 0, 0] + calculate_diff_ratio(base_size, 0)) - for op_name, comparison_data_dict in comparison_op_dict.items(): - comparison_dur = sum(comparison_data_dict.get("duration", [])) - comparison_size = sum(comparison_data_dict.get("size", [])) - comparison_num = len(comparison_data_dict.get("size", [])) - result_data.append([op_name, 
0, 0, 0, comparison_dur, comparison_size, comparison_num] + - calculate_diff_ratio(0, comparison_size)) - if ArgsManager().base_profiling_path != ArgsManager().comparison_profiling_path: - result_data.sort(key=lambda x: x[-2], reverse=True) - return result_data diff --git a/profiler/compare_tools/generation/operator_compare_generator.py b/profiler/compare_tools/generation/operator_compare_generator.py deleted file mode 100644 index 0f876a3ed8..0000000000 --- a/profiler/compare_tools/generation/operator_compare_generator.py +++ /dev/null @@ -1,39 +0,0 @@ -from generation.base_generator import BaseGenerator -from utils.args_manager import ArgsManager -from utils.common_func import calculate_diff_ratio -from utils.constant import Constant -from utils.torch_op_node import TorchOpNode -from utils.tree_builder import TreeBuilder - - -class OperatorCompareGenerator(BaseGenerator): - def __init__(self, data: list): - super().__init__(Constant.OPERATOR_SHEET, data) - - def generate_data(self): - def get_row_info(torch_op_node: TorchOpNode): - if not torch_op_node: - return [None] * 4 + [0] - kernel_list = TreeBuilder.get_total_kernels(torch_op_node) - duration = 0 - kernel_details = "" - for kernel in kernel_list: - duration += kernel.device_dur - kernel_details += kernel.kernel_details - return [torch_op_node.name, torch_op_node.input_shape, torch_op_node.input_type, kernel_details, duration] - - if not self.data: - return [] - data = [None] * (len(self.data)) - index = 0 - for base_op, comparison_op in self.data: - base_row = get_row_info(base_op) - if ArgsManager().base_profiling_path == ArgsManager().comparison_profiling_path: - comparison_row = [None] * 5 - diff_ratio = [None] * 2 - else: - comparison_row = get_row_info(comparison_op) - diff_ratio = calculate_diff_ratio(base_row[-1], comparison_row[-1]) - data[index] = base_row + comparison_row + diff_ratio - index += 1 - return data diff --git a/profiler/compare_tools/generation/operator_statistic_generator.py 
b/profiler/compare_tools/generation/operator_statistic_generator.py deleted file mode 100644 index ec685c42f4..0000000000 --- a/profiler/compare_tools/generation/operator_statistic_generator.py +++ /dev/null @@ -1,43 +0,0 @@ -from generation.base_generator import BaseGenerator -from utils.args_manager import ArgsManager -from utils.common_func import calculate_diff_ratio -from utils.constant import Constant -from utils.tree_builder import TreeBuilder - - -class OperatorStatisticGenerator(BaseGenerator): - def __init__(self, data: list): - super().__init__(Constant.OPERATOR_TOP_SHEET, data) - - def generate_data(self): - base_op_dict, comparison_op_dict = {}, {} - for base_op, comparison_op in self.data: - if base_op: - kernel_list = TreeBuilder.get_total_kernels(base_op) - duration = sum([kernel.device_dur / Constant.US_TO_MS for kernel in kernel_list]) - base_op_dict.setdefault(base_op.name, []).append(duration) - if comparison_op: - kernel_list = TreeBuilder.get_total_kernels(comparison_op) - duration = sum([kernel.device_dur / Constant.US_TO_MS for kernel in kernel_list]) - comparison_op_dict.setdefault(comparison_op.name, []).append(duration) - result_data = [] - for op_name, base_duration_list in base_op_dict.items(): - base_dur = sum(base_duration_list) - comparison_duration_list = comparison_op_dict.pop(op_name, None) - if ArgsManager().base_profiling_path == ArgsManager().comparison_profiling_path: - result_data.append([op_name, base_dur, len(base_duration_list)] + [None] * 4) - elif comparison_duration_list: - comparison_dur = sum(comparison_duration_list) - result_data.append( - [op_name, base_dur, len(base_duration_list), comparison_dur, - len(comparison_duration_list)] + calculate_diff_ratio(base_dur, comparison_dur)) - else: - result_data.append( - [op_name, base_dur, len(base_duration_list), 0, 0] + calculate_diff_ratio(base_dur, 0)) - for op_name, comparison_duration_list in comparison_op_dict.items(): - comparison_dur = sum(comparison_duration_list) 
- result_data.append([op_name, 0, 0, comparison_dur, len(comparison_duration_list)] + - calculate_diff_ratio(0, comparison_dur)) - if ArgsManager().base_profiling_path != ArgsManager().comparison_profiling_path: - result_data.sort(key=lambda x: x[-2], reverse=True) - return result_data diff --git a/profiler/compare_tools/generator/__init__.py b/profiler/compare_tools/generator/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/profiler/compare_tools/generator/base_generator.py b/profiler/compare_tools/generator/base_generator.py new file mode 100644 index 0000000000..c472bc9922 --- /dev/null +++ b/profiler/compare_tools/generator/base_generator.py @@ -0,0 +1,22 @@ +from abc import ABC, abstractmethod +from multiprocessing import Process + + +class BaseGenerator(Process, ABC): + def __init__(self, profiling_data_dict: dict, args: any): + super(BaseGenerator, self).__init__() + self._profiling_data_dict = profiling_data_dict + self._args = args + self._result_data = {} + + def run(self): + self.compare() + self.generate_view() + + @abstractmethod + def compare(self): + raise NotImplementedError("Function compare need to be implemented.") + + @abstractmethod + def generate_view(self): + raise NotImplementedError("Function generate_view need to be implemented.") diff --git a/profiler/compare_tools/generator/comparison_generator.py b/profiler/compare_tools/generator/comparison_generator.py new file mode 100644 index 0000000000..2bd4175dba --- /dev/null +++ b/profiler/compare_tools/generator/comparison_generator.py @@ -0,0 +1,36 @@ +from generator.detail_performance_generator import DetailPerformanceGenerator +from generator.overall_performance_generator import OverallPerformanceGenerator +from profiling_parser.gpu_profiling_parser import GPUProfilingParser +from profiling_parser.npu_profiling_parser import NPUProfilingParser +from utils.constant import Constant +from utils.args_manager import ArgsManager + + +class ComparisonGenerator: + 
PARSER_DICT = {Constant.NPU: NPUProfilingParser, Constant.GPU: GPUProfilingParser} + + def __init__(self): + self._args_manager = ArgsManager() + self._overall_data = None + self._details_data = None + + def run(self): + self.load_data() + self.generate_compare_result() + + def load_data(self): + base_data = self.PARSER_DICT.get(self._args_manager.base_profiling_type)( + self._args_manager.args, self._args_manager.base_path_dict).load_data() + comparison_data = self.PARSER_DICT.get(self._args_manager.comparison_profiling_type)( + self._args_manager.args, self._args_manager.comparison_path_dict).load_data() + self._overall_data = {Constant.BASE_DATA: base_data.pop(Constant.OVERALL_METRICS, None), + Constant.COMPARISON_DATA: comparison_data.pop(Constant.OVERALL_METRICS, None)} + self._details_data = {Constant.BASE_DATA: base_data, Constant.COMPARISON_DATA: comparison_data} + + def generate_compare_result(self): + generator_list = [OverallPerformanceGenerator(self._overall_data, self._args_manager.args), + DetailPerformanceGenerator(self._details_data, self._args_manager.args)] + for generator in generator_list: + generator.start() + for generator in generator_list: + generator.join() diff --git a/profiler/compare_tools/comparator/op_comparator.py b/profiler/compare_tools/generator/detail_performance_generator.py similarity index 59% rename from profiler/compare_tools/comparator/op_comparator.py rename to profiler/compare_tools/generator/detail_performance_generator.py index 8ccd428ef8..cf474544ca 100644 --- a/profiler/compare_tools/comparator/op_comparator.py +++ b/profiler/compare_tools/generator/detail_performance_generator.py @@ -1,26 +1,67 @@ +import os from collections import deque +from datetime import datetime import numpy as np -from utils.args_manager import ArgsManager +from comparator.communication_comparator import CommunicationComparator +from comparator.operator_comparator import OperatorComparator +from comparator.operator_statistic_generator import 
OperatorStatisticComparator +from compare_bean.communication_bean import CommunicationBean +from compare_bean.memory_compare_bean import MemoryCompareBean +from compare_bean.memory_statistic_bean import MemoryStatisticBean +from compare_bean.operator_compare_bean import OperatorCompareBean +from compare_bean.operator_statistic_bean import OperatorStatisticBean +from generator.base_generator import BaseGenerator +from profiler.cluster_analyse.common_func.path_manager import PathManager +from utils.constant import Constant from utils.name_function import NameFunction from utils.torch_op_node import TorchOpNode from utils.tree_builder import TreeBuilder +from view.excel_view import ExcelView -class OpComparator: - def __init__(self, args: any): - self._args = args - self._args_manager = ArgsManager() - self._base_profiling = self._args_manager.base_profiling - self._comparison_profiling = self._args_manager.comparison_profiling +class DetailPerformanceGenerator(BaseGenerator): + def __init__(self, profiling_data_dict: dict, args: any): + super().__init__(profiling_data_dict, args) - def compare(self) -> list: - base_ops = self._get_top_layer_ops(self._base_profiling) - if self._args.base_profiling_path == self._args.comparison_profiling_path: - comparison_ops = [] - else: - comparison_ops = self._get_top_layer_ops(self._comparison_profiling) + def compare(self): + if self._args.enable_operator_compare or self._args.enable_memory_compare or \ + self._args.enable_communication_compare: + print("[INFO] Start to compare performance detail data, please wait.") + comparator_list = self._create_comparator() + for comparator in comparator_list: + self._result_data.update(comparator.generate_data()) + + def generate_view(self): + if not self._result_data: + return + dir_path = self._args.output_path if self._args.output_path else "./" + file_name = "performance_comparison_result_{}.xlsx".format(datetime.utcnow().strftime("%Y%m%d%H%M%S")) + result_file_path = 
PathManager.get_realpath(os.path.join(dir_path, file_name)) + ExcelView(self._result_data, result_file_path, self._args).generate_view() + print(f"[INFO] The comparison result file has been generated: {result_file_path}") + + def _create_comparator(self): + comparator_list = [] + if self._args.enable_operator_compare or self._args.enable_memory_compare: + op_compare_result = self.match_torch_op() + + if self._args.enable_communication_compare: + comparator_list.append(CommunicationComparator(self._profiling_data_dict, CommunicationBean)) + + if self._args.enable_operator_compare: + comparator_list.append(OperatorComparator(op_compare_result, OperatorCompareBean)) + comparator_list.append(OperatorStatisticComparator(op_compare_result, OperatorStatisticBean)) + + if self._args.enable_memory_compare: + comparator_list.append(OperatorComparator(op_compare_result, MemoryCompareBean)) + comparator_list.append(OperatorStatisticComparator(op_compare_result, MemoryStatisticBean)) + return comparator_list + + def match_torch_op(self) -> list: + base_ops = self._get_top_layer_ops(self._profiling_data_dict.get(Constant.BASE_DATA, {})) + comparison_ops = self._get_top_layer_ops(self._profiling_data_dict.get(Constant.COMPARISON_DATA, {})) if not base_ops and not comparison_ops: return [] name_func = NameFunction(self._args).get_name_func() @@ -29,7 +70,6 @@ class OpComparator: compare_result_data = self._drill_down(compare_result_data, name_func) return compare_result_data - @classmethod def _matching_op(cls, base_ops: list, comparison_ops: list, name_func: any) -> list: if not comparison_ops: @@ -83,21 +123,14 @@ class OpComparator: result_data.append([None, comparison_ops[comparison_index]]) return result_data - def _get_top_layer_ops(self, profiling_instance: any) -> any: - torch_op_data = profiling_instance.torch_op_data - if not torch_op_data: - print(f"[WARNING] Can't find any torch op in the file: {profiling_instance.json_path}") + def _get_top_layer_ops(self, 
profiling_data: any) -> any: + torch_op_data = profiling_data.pop(Constant.TORCH_OP, []) root_node = TreeBuilder.build_tree(torch_op_data) - kernel_dict, memory_list = {}, [] if self._args.enable_operator_compare: - kernel_dict = profiling_instance.kernel_dict - if not kernel_dict: - print(f"[WARNING] Can't find any flow event in the file: {profiling_instance.json_path}") + kernel_dict = profiling_data.pop(Constant.KERNEL_DICT, {}) if self._args.enable_memory_compare: - memory_list = profiling_instance.memory_list - if not memory_list: - print(f"[WARNING] Can't find any memory event in the file: {profiling_instance.file_path}") + memory_list = profiling_data.pop(Constant.MEMORY_LIST, []) TreeBuilder.update_tree_node(root_node, kernel_dict, memory_list) level1_child_nodes = root_node.child_nodes diff --git a/profiler/compare_tools/generator/overall_performance_generator.py b/profiler/compare_tools/generator/overall_performance_generator.py new file mode 100644 index 0000000000..d2aa181371 --- /dev/null +++ b/profiler/compare_tools/generator/overall_performance_generator.py @@ -0,0 +1,19 @@ +from comparator.overall_performance_comparator import OverallPerformanceComparator +from compare_bean.profiling_info import ProfilingInfo +from generator.base_generator import BaseGenerator +from view.screen_view import ScreenView + + +class OverallPerformanceGenerator(BaseGenerator): + def __init__(self, profiling_data_dict: dict, args: any): + super().__init__(profiling_data_dict, args) + + def compare(self): + if not self._args.enable_profiling_compare: + return + self._result_data = OverallPerformanceComparator(self._profiling_data_dict, ProfilingInfo).generate_data() + + def generate_view(self): + if not self._result_data: + return + ScreenView(self._result_data).generate_view() diff --git a/profiler/compare_tools/performance_compare.py b/profiler/compare_tools/performance_compare.py index 4ab8bb8985..6218c7e969 100644 --- a/profiler/compare_tools/performance_compare.py +++ 
b/profiler/compare_tools/performance_compare.py @@ -3,21 +3,12 @@ import ast import datetime import os.path import sys -import time sys.path.append( os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "cluster_analyse")) -from generation.comparison_generator import ComparisonGenerator +from generator.comparison_generator import ComparisonGenerator from utils.args_manager import ArgsManager -from profiling_analysis.profiling_parse import prof_main -from common_func.path_manager import PathManager - - -def performance_compare(args): - if not args.enable_profiling_compare: - return - prof_main() def main(): @@ -37,20 +28,7 @@ def main(): args = parser.parse_args() ArgsManager().init(args) - - try: - performance_compare(args) - except Exception: - print("[WARNING] Profiling failed to analyze.") - - if any([args.enable_operator_compare, args.enable_memory_compare, args.enable_communication_compare]): - print("[INFO] Start to compare performance data, please wait.") - dir_path = args.output_path if args.output_path else "./" - file_name = "performance_comparison_result_{}.xlsx".format( - time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) - result_file_path = PathManager.get_realpath(os.path.join(dir_path, file_name)) - ComparisonGenerator(args).run(result_file_path) - print(f"[INFO] The comparison result file has been generated: {result_file_path}") + ComparisonGenerator().run() if __name__ == "__main__": diff --git a/profiler/compare_tools/profiling_analysis/__init__.py b/profiler/compare_tools/profiling_analysis/__init__.py deleted file mode 100644 index 8400fd5ecd..0000000000 --- a/profiler/compare_tools/profiling_analysis/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/profiler/compare_tools/profiling_analysis/gpu_parser.py b/profiler/compare_tools/profiling_analysis/gpu_parser.py deleted file mode 100644 index 8f1b6d9c03..0000000000 --- a/profiler/compare_tools/profiling_analysis/gpu_parser.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright (c) 2023, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from collections import Counter, defaultdict -import pandas as pd - -import profiling_analysis.parser_helper as parser_helper -from utils.file_reader import FileReader -from utils.constant import Constant - - -class OpTimeWarper: - def __init__( - self, - cube_time: float = 0.0, - sdma_time: float = 0.0, - vec_time: float = 0.0, - fa_time_fwd: float = 0.0, - fa_time_bwd: float = 0.0, - all_op_time: float = 0.0, - compute_stream_dur: float = 0.0, - cube_num: int = 0, - vec_num: int = 0, - sdma_num: int = 0, - fa_num_bwd: int = 0, - fa_num_fwd: int = 0 - ): - self.cube_time = cube_time - self.sdma_time = sdma_time - self.vec_time = vec_time - self.fa_time_fwd = fa_time_fwd - self.fa_time_bwd = fa_time_bwd - self.all_op_time = all_op_time - self.compute_stream_dur = compute_stream_dur - self.cube_num = cube_num - self.vec_num = vec_num - self.sdma_num = sdma_num - self.fa_num_bwd = fa_num_bwd - self.fa_num_fwd = fa_num_fwd - - -class GpuProfilingParser: - NCCL_MARK = 'nccl' - CUBE_MARK = 'gemm' - FA_MARK_LIST = [['fmha', 'kernel'], ['flash', 'kernel']] - SDMA_MARK_LIST = ['htod', 'dtod', 'dtoh', 'memset (device)'] - - def __init__(self, gpu_path): - self.trace_events = FileReader.read_trace_file(gpu_path).get('traceEvents') - self.compute_stream_id = self.infer_compute_stream_id() - self.one_step_time = 0 - self.profiling_info = parser_helper.ProfilingInfo('GPU') - - def is_flash_attention(self, name: str): - for fa_mark in self.FA_MARK_LIST: - if not len([1 for mark in fa_mark if mark not in name.lower()]): - return True - return False - - def is_sdma_time(self, name: str): - for mark in self.SDMA_MARK_LIST: - if mark in name.lower(): - return True - return False - - def update_op_list(self, op_list, marks): - cube_time = 0.0 - all_op_time = 0.0 - fa_time_bwd = 0.0 - fa_time_fwd = 0.0 - sdma_time = 0.0 - vec_time = 0.0 - cube_num = 0 - vec_num = 0 - sdma_num = 0 - fa_num_bwd = 0 - fa_num_fwd = 0 - compute_stream_dur = 0.0 - for event in 
self.trace_events: - if not isinstance(event, dict): - continue - if event.get('args') and event.get('args').get('stream') == self.compute_stream_id: - compute_stream_dur += float(event.get('dur')) - if not {'name', 'cat', 'dur', 'ts'} < event.keys(): - continue - name = event.get('name') - dur = event.get('dur') - ts = event.get('ts') - cat = event.get('cat', '') - if event.get('args') and event.get('args').get('stream') == self.compute_stream_id: - if self.is_sdma_time(name): - sdma_time += float(dur) - sdma_num += 1 - continue - if cat.lower() != 'kernel': - continue - if self.NCCL_MARK in name.lower(): - for timestep in range(ts + 1, ts + dur + 1): - marks[str(timestep)] += 1 # mark this timestep in communication stream - continue - else: - for timestep in range(ts + 1, ts + dur + 1): - marks[str(timestep)] += -100 # mark this timestep in compute stream - if self.is_flash_attention(name): - if 'bwd' in name.lower(): - fa_time_bwd += float(dur) - fa_num_bwd += 1 - else: - fa_time_fwd += float(dur) - fa_num_fwd += 1 - elif self.CUBE_MARK in name.lower(): - cube_num += 1 - cube_time += float(dur) - else: - vec_num += 1 - vec_time += float(dur) - all_op_time += float(dur) - op_list.append([ts, name, cat, dur]) - time_wrapper = OpTimeWarper( - cube_time=cube_time, - sdma_time=sdma_time, - vec_time=vec_time, - fa_time_fwd=fa_time_fwd, - fa_time_bwd=fa_time_bwd, - all_op_time=all_op_time, - compute_stream_dur=compute_stream_dur, - cube_num=cube_num, - vec_num=vec_num, - sdma_num=sdma_num, - fa_num_bwd=fa_num_bwd, - fa_num_fwd=fa_num_fwd - ) - return time_wrapper - - def parse_events(self): - op_list = [] - marks = defaultdict(int) # mark for compute communication_not_overlapped time - - time_wrapper = self.update_op_list(op_list, marks) - cube_time = time_wrapper.cube_time - fa_time_fwd = time_wrapper.fa_time_fwd - fa_time_bwd = time_wrapper.fa_time_bwd - all_op_time = time_wrapper.all_op_time - compute_stream_dur = time_wrapper.compute_stream_dur - cube_num = 
time_wrapper.cube_num - vec_num = time_wrapper.vec_num - sdma_num = time_wrapper.sdma_num - sdma_time = time_wrapper.sdma_time - vec_time = time_wrapper.vec_time - - self.profiling_info.compute_time = len([_ for _, value in marks.items() if value < 0]) / 10 ** 6 - self.profiling_info.communication_not_overlapped = len([_ for _, value in marks.items() if value > 0]) / 10 ** 6 - self.profiling_info.flash_attention_time_bwd = fa_time_bwd / 10 ** 6 - self.profiling_info.flash_attention_time_fwd = fa_time_fwd / 10 ** 6 - self.profiling_info.cube_time = cube_time / 10 ** 6 - self.profiling_info.vec_time = self.profiling_info.compute_time - (cube_time + fa_time_fwd + fa_time_bwd) / 10 ** 6 - self.profiling_info.cube_num = cube_num - self.profiling_info.vec_num = vec_num - self.profiling_info.sdma_num = sdma_num - self.profiling_info.fa_num_bwd = time_wrapper.fa_num_bwd - self.profiling_info.fa_num_fwd = time_wrapper.fa_num_fwd - self.profiling_info.sdma_time = sdma_time / 10 ** 6 - self.parse_e2e_time() - - self.profiling_info.scheduling_time = self.profiling_info.e2e_time - self.profiling_info.compute_time - \ - self.profiling_info.communication_not_overlapped - if self.profiling_info.e2e_time < Constant.EPS: - self.profiling_info.scheduling_ratio = 0.0 - else: - self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time - self.parse_memory_reserved() - - def parse_e2e_time(self): - compute_events_timeline = [event for event in self.trace_events if - event.get('args') and event.get('args').get('stream')] - compute_events_timeline = sorted(compute_events_timeline, key=lambda event: event.get('ts')) - self.profiling_info.e2e_time = (compute_events_timeline[-1].get('ts') + compute_events_timeline[-1].get('dur') - - compute_events_timeline[0].get('ts')) / 10 ** 6 - - def parse_memory_reserved(self): - memories = [ - event.get('args').get('Total Reserved') for event in self.trace_events - if event.get('name', '').lower() == 
'[memory]' and event.get('args').get('Device Id') >= 0 - ] - if not memories: - print("[INFO] Gpu profiling data doesn't contain memory info") - return - self.profiling_info.memory_used = max(memories) / 1024 ** 3 - - def infer_compute_stream_id(self): - kernel_stream_ids = [] - for event in self.trace_events: - is_kernel_exec_event = event.get('cat', '').lower() == 'kernel' and self.NCCL_MARK not in event.get('name', '').lower() - has_stream_id_event = event.get('args') and event.get('args').get('stream') - if is_kernel_exec_event and has_stream_id_event: - kernel_stream_ids.append(event.get('args').get('stream')) - if not kernel_stream_ids: - raise RuntimeError('[ERROR] The profiling data does not contain kernel running data.') - counter = Counter(kernel_stream_ids) - return counter.most_common(1)[0][0] diff --git a/profiler/compare_tools/profiling_analysis/npu_parser.py b/profiler/compare_tools/profiling_analysis/npu_parser.py deleted file mode 100644 index 47634389a1..0000000000 --- a/profiler/compare_tools/profiling_analysis/npu_parser.py +++ /dev/null @@ -1,293 +0,0 @@ -# Copyright (c) 2023, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -from collections import defaultdict -import pandas as pd -import profiling_analysis.parser_helper as parser_helper -from utils.file_reader import FileReader -from common_func.path_manager import PathManager -from common_func.file_manager import FileManager - - -class NpuInfoWrapper: - def __init__( - self, - compute_time: int, - communication_time: int, - sdma_time: int, - sdma_num: int, - is_cluster: bool, - event_wait_sqe: dict, - ai_core_dict: dict, - event_wait_sqe_res: dict, - ai_core_res: dict, - ): - self.compute_time = compute_time - self.communication_time = communication_time - self.sdma_time = sdma_time - self.sdma_num = sdma_num - self.is_cluster = is_cluster - self.event_wait_sqe = event_wait_sqe - self.ai_core_dict = ai_core_dict - self.event_wait_sqe_res = event_wait_sqe_res - self.ai_core_res = ai_core_res - - -class NpuProfilingParser: - FLASH_ATTENTION = "flashattention" - ACLNNINPLACE_COPY = "aclnninplacecopy" - TENSORMOVE = "tensormove" - - def __init__(self, npu_step_time, npu_file_path): - self.npu_json_file = npu_file_path.get('trace_view') - self.npu_summary_file = npu_file_path.get('kernel_details') - self.npu_mem_file = npu_file_path.get('memory_record') - self.info_json = npu_file_path.get('info') - self.profiling_info = parser_helper.ProfilingInfo('NPU') - self.npu_step_time = npu_step_time - self.parallel_time = 0 - self.aicore_time = 0 - self.min_stream_ts = sys.float_info.max - self.max_stream_ts = sys.float_info.min - self.sdma_sqe = defaultdict(float) - self.sdma_num_cnt = defaultdict(int) - - def get_sdma_para(self, sdma_sqe, sdma_num_cnt, ai_core_dict, event_wait_sqe) -> (float, int): - compute_stream = [] - parallel_stream = [] - sdma_time = 0.0 - sdma_parallel_time = 0.0 - sdma_num = 0 - sdma_parallel_num = 0 - if len(ai_core_dict) == 1: - compute_stream.append(min(ai_core_dict.keys())) - elif len(ai_core_dict) == 2: # 2个ai_core,存在并行流(当前最多2条算子计算流) - compute_stream = list(event_wait_sqe.keys() & 
ai_core_dict.keys()) - parallel_stream = list(ai_core_dict.keys() - set(compute_stream)) - else: - print('[WARNING] Npu Compute Stream Num Error.') - if parallel_stream: - sdma_parallel_time = sdma_sqe[parallel_stream[0]] - sdma_parallel_num = sdma_num_cnt[parallel_stream[0]] - if compute_stream: - sdma_time = sdma_sqe[compute_stream[0]] + sdma_parallel_time - sdma_num = sdma_num_cnt[compute_stream[0]] + sdma_parallel_num - return sdma_time, sdma_num - - def parse_npu_json_events(self): - if not self.npu_json_file: - print('[WARNING] Npu trace json file is not available.') - return - compute_time = 0 - communication_time = 0 - min_ts = sys.float_info.max - max_ts = sys.float_info.min - is_cluster = False # 表明没有获取到compute time的耗时 - data = FileReader.read_trace_file(self.npu_json_file) - event_wait_sqe = defaultdict(list) - ai_core_dict = defaultdict(list) - event_wait_sqe_res = defaultdict(float) - ai_core_res = defaultdict(float) - for dic in data: - self.get_ts_by_task_type(dic, event_wait_sqe, ai_core_dict, event_wait_sqe_res, ai_core_res) - if ('name' in dic) and (dic.get('name', '') == 'Computing'): - is_cluster = True - ts = float(dic.get('ts', 0)) - dur = dic.get('dur') - compute_time += dur - min_ts = ts if ts < min_ts else min_ts - max_ts = (ts + dur) if (ts + dur) > max_ts else max_ts - if ('name' in dic) and (dic.get('name', '') == 'Communication(Not Overlapped)'): - is_cluster = True - ts = float(dic.get('ts')) - dur = dic.get('dur') - communication_time += dur - min_ts = ts if ts < min_ts else min_ts - max_ts = (ts + dur) if (ts + dur) > max_ts else max_ts - sdma_time, sdma_num = self.get_sdma_para(self.sdma_sqe, self.sdma_num_cnt, ai_core_dict, event_wait_sqe) - npu_info_wrapper = NpuInfoWrapper( - compute_time, communication_time, sdma_time, sdma_num, is_cluster, - event_wait_sqe, ai_core_dict, event_wait_sqe_res, ai_core_res) - self.update_npu_info(max_ts - min_ts, npu_info_wrapper) - - def update_npu_info(self, ts_dur, npu_info_wrapper): - 
compute_time = npu_info_wrapper.compute_time - communication_time = npu_info_wrapper.communication_time - is_cluster = npu_info_wrapper.is_cluster - event_wait_sqe = npu_info_wrapper.event_wait_sqe - ai_core_dict = npu_info_wrapper.ai_core_dict - event_wait_sqe_res = npu_info_wrapper.event_wait_sqe_res - ai_core_res = npu_info_wrapper.ai_core_res - sdma_time = npu_info_wrapper.sdma_time - sdma_num = npu_info_wrapper.sdma_num - # AI_CORE和EVENT_WAIT_SQE共存为计算流 - compute_stream = [] - parallel_stream = [] - if not is_cluster: - #单机单卡没有overlap analysis - if len(ai_core_dict) == 1: - compute_stream.append(min(ai_core_dict.keys())) - elif len(ai_core_dict) == 2: # 2个ai_core,存在并行流(当前最多2条算子计算流) - compute_stream = list(event_wait_sqe.keys() & ai_core_dict.keys()) - parallel_stream = list(ai_core_dict.keys() - set(compute_stream)) - else: - print('[WARNING] Npu trace json file lack of Stream info') - return - cs_event_wait_sqe_list = event_wait_sqe[compute_stream[0]] - if parallel_stream: - cs_ai_core_list = ai_core_dict[parallel_stream[0]] - sorted(cs_event_wait_sqe_list, key=lambda x: (x[0])) - sorted(cs_ai_core_list, key=lambda x: (x[0])) - self.parallel_time = self.interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list) - self.profiling_info.compute_time = compute_time / 10 ** 6 if is_cluster else \ - ai_core_res[compute_stream[0]] / 10 ** 6 - self.profiling_info.other_time = max(0, self.profiling_info.compute_time - self.profiling_info.cube_time - \ - self.profiling_info.flash_attention_time_fwd - self.profiling_info.flash_attention_time_bwd - \ - self.profiling_info.vec_time) - self.profiling_info.e2e_time = ts_dur / 10 ** 6 if is_cluster else \ - (self.max_stream_ts - self.min_stream_ts) / 10 ** 6 - self.profiling_info.communication_not_overlapped = communication_time / 10 ** 6 \ - if is_cluster else (event_wait_sqe_res[compute_stream[0]] - self.parallel_time) / 10 ** 6 - time_required = self.profiling_info.compute_time + 
self.profiling_info.communication_not_overlapped - self.profiling_info.sdma_time += sdma_time / 10 ** 6 - self.profiling_info.sdma_num += sdma_num - if self.npu_step_time: - self.profiling_info.scheduling_time = self.npu_step_time - time_required - else: - self.profiling_info.scheduling_time = self.profiling_info.e2e_time - time_required - self.profiling_info.scheduling_ratio = self.profiling_info.scheduling_time / self.profiling_info.e2e_time \ - if self.profiling_info.e2e_time != 0 else 0 - - def parse_info_json(self): - if not self.info_json: - return - json_data = FileReader.read_trace_file(self.info_json) - if not json_data: - return - if "ProfilerActivity.CPU" in json_data.get('config', {}).get('common_config', {}).get('activities', []): - return - if 'Level0' != json_data.get('config', {}).get('experimental_config', {}).get('_profiler_level', ''): - return - self.profiling_info.minimal_profiling = True - - def parse_npu_csv_events(self): - self.parse_mem_csv() - if not self.npu_summary_file: - print('[WARNING] Npu kernel details csv file is not available.') - return - PathManager.check_path_readable(self.npu_summary_file) - FileManager.check_file_size(self.npu_summary_file) - info = pd.read_csv(self.npu_summary_file, index_col=None) - cube_time = 0.0 - vec_time = 0.0 - sdma_time = 0.0 - fa_time_fwd = 0.0 - fa_time_bwd = 0.0 - cube_num = 0 - vec_num = 0 - fa_num_bwd = 0 - fa_num_fwd = 0 - sdma_num = 0 - if info.get('mac_time(us)') is None and info.get('aiv_vec_time(us)') is None: - self.profiling_info.hide_op_details = True - return - for i in range(len(info['Model ID'])): - op_type = info.loc[i, 'Type'] - name = info.loc[i, 'Name'] - aiv_vec_time = info.loc[i, 'aiv_vec_time(us)'] if info.get('aiv_vec_time(us)') is not None else None - mac_time = info.loc[i, 'mac_time(us)'] if info.get('mac_time(us)') is not None else None - if pd.isna(aiv_vec_time) and pd.isna(mac_time): - continue - task_durations = info.loc[i, 'Duration(us)'] - if self.FLASH_ATTENTION in 
op_type.lower(): - if 'bwd' in op_type.lower() or 'grad' in op_type.lower(): - fa_time_bwd += task_durations - fa_num_bwd += 1 - else: - fa_time_fwd += task_durations - fa_num_fwd += 1 - elif name.lower().startswith(self.ACLNNINPLACE_COPY) and self.TENSORMOVE in name.lower(): - sdma_time += task_durations - sdma_num += 1 - else: - is_vec = (aiv_vec_time and aiv_vec_time > 0) or (mac_time is not None and mac_time == 0) - if is_vec: - vec_time += task_durations - vec_num += 1 - else: - cube_time += task_durations - cube_num += 1 - - self.profiling_info.cube_time = cube_time / 10 ** 6 - self.profiling_info.vec_time = vec_time / 10 ** 6 - self.profiling_info.flash_attention_time_bwd = fa_time_bwd / 10 ** 6 - self.profiling_info.flash_attention_time_fwd = fa_time_fwd / 10 ** 6 - self.profiling_info.cube_num = cube_num - self.profiling_info.vec_num = vec_num - self.profiling_info.fa_num_bwd = fa_num_bwd - self.profiling_info.fa_num_fwd = fa_num_fwd - self.profiling_info.sdma_time = sdma_time / 10 ** 6 - self.profiling_info.sdma_num = sdma_num - - - def parse_mem_csv(self): - if not self.npu_mem_file: - print('[INFO] Npu op memory csv file is not available.') - return - try: - PathManager.check_path_readable(self.npu_mem_file) - FileManager.check_file_size(self.npu_mem_file) - info = pd.read_csv(self.npu_mem_file, usecols=['Total Reserved(MB)'], index_col=None) - except ValueError: - print('[ERROR] Load memory info failed.') - else: - self.profiling_info.memory_used = max(info.get('Total Reserved(MB)')) / 1024 - - @staticmethod - def interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list): - ans = 0 - i = 0 - j = 0 - while i < len(cs_event_wait_sqe_list) and j < len(cs_ai_core_list): - lo = max(cs_event_wait_sqe_list[i][0], cs_ai_core_list[j][0]) - hi = min(cs_event_wait_sqe_list[i][1], cs_ai_core_list[j][1]) - if lo <= hi: - ans += (hi - lo) - if cs_event_wait_sqe_list[i][1] < cs_ai_core_list[j][1]: - i += 1 - else: - j += 1 - return ans - - def 
get_ts_by_task_type(self, dic, event_wait_sqe, ai_core_dict, enent_wait_res, ai_core_res): - if not dic.get('args'): - return - args = dic.get('args') - if args.get('Stream Id'): - stream_id = args.get('Stream Id') - ts = float(dic.get('ts')) - dur = dic.get('dur') - if args.get('Task Type') == 'EVENT_WAIT_SQE': - enent_wait_res[stream_id] += dur - event_wait_sqe[stream_id].append([ts, ts + dur]) - elif args.get('Task Type') in ('SDMA_SQE', 'PCIE_DMA_SQE'): - self.sdma_sqe[stream_id] += dur - self.sdma_num_cnt[stream_id] += 1 - elif args.get('Task Type') in ('AI_CORE', 'MIX_AIC', 'MIX_AIV', 'AI_CPU', 'AI_VECTOR_CORE', 'FFTS_PLUS'): - ai_core_res[stream_id] += dur - ai_core_dict[stream_id].append([ts, ts + dur]) - self.min_stream_ts = ts if ts < self.min_stream_ts else self.min_stream_ts - self.max_stream_ts = (ts + dur) if (ts + dur) > self.max_stream_ts else self.max_stream_ts diff --git a/profiler/compare_tools/profiling_analysis/profiling_parse.py b/profiler/compare_tools/profiling_analysis/profiling_parse.py deleted file mode 100644 index adf182900f..0000000000 --- a/profiler/compare_tools/profiling_analysis/profiling_parse.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2023, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import os - -from prettytable import PrettyTable - -from profiling_analysis.gpu_parser import GpuProfilingParser -from profiling_analysis.npu_parser import NpuProfilingParser -from profiling_analysis.parser_helper import ProfilingInfo -from utils.args_manager import ArgsManager -from utils.constant import Constant - - -def generate_table_info(base_profiling_info, comp_profiling_info, table): - headers = [''] - base_col = [f'{base_profiling_info.profiling_type}'] - comp_col = [f'{comp_profiling_info.profiling_type}'] - if not base_profiling_info.hide_op_details and not comp_profiling_info.hide_op_details: - headers.extend(['Cube Time(Num)', 'Vector Time(Num)']) - base_col.extend([f'{base_profiling_info.cube_time:.3f}s({base_profiling_info.cube_num})', - f'{base_profiling_info.vec_time:.3f}s({base_profiling_info.vec_num})']) - comp_col.extend([f'{comp_profiling_info.cube_time:.3f}s({comp_profiling_info.cube_num})', - f'{comp_profiling_info.vec_time:.3f}s({comp_profiling_info.vec_num})']) - if base_profiling_info.other_time or comp_profiling_info.other_time: - headers.append('Other Time') - base_col.append(f'{base_profiling_info.other_time:.3f}s') - comp_col.append(f'{comp_profiling_info.other_time:.3f}s') - if base_profiling_info.flash_attention_time_fwd or comp_profiling_info.flash_attention_time_fwd: - headers.append('Flash Attention Time(Forward)(Num)') - base_col.append(f'{base_profiling_info.flash_attention_time_fwd:.3f}s({base_profiling_info.fa_num_fwd})') - comp_col.append(f'{comp_profiling_info.flash_attention_time_fwd:.3f}s({comp_profiling_info.fa_num_fwd})') - if base_profiling_info.flash_attention_time_bwd or comp_profiling_info.flash_attention_time_bwd: - headers.append('Flash Attention Time(Backward)(Num)') - base_col.append(f'{base_profiling_info.flash_attention_time_bwd:.3f}s({base_profiling_info.fa_num_bwd})') - comp_col.append(f'{comp_profiling_info.flash_attention_time_bwd:.3f}s({comp_profiling_info.fa_num_bwd})') - 
headers.extend(['Computing Time']) - base_col.extend([f'{base_profiling_info.compute_time:.3f}s']) - comp_col.extend([f'{comp_profiling_info.compute_time:.3f}s']) - if base_profiling_info.memory_used or comp_profiling_info.memory_used: - headers.append('Mem Usage') - base_col.append(f'{base_profiling_info.memory_used:.2f}G') - comp_col.append(f'{comp_profiling_info.memory_used:.2f}G') - headers.extend(['Uncovered Communication Time']) - base_col.extend( - [f'{base_profiling_info.communication_not_overlapped: .3f}s']) - comp_col.extend( - [f'{comp_profiling_info.communication_not_overlapped: .3f}s']) - if base_profiling_info.sdma_time or comp_profiling_info.sdma_time: - headers.append('SDMA Time(Num)') - base_col.append(f'{base_profiling_info.sdma_time:.3f}s({base_profiling_info.sdma_num})') - comp_col.append(f'{comp_profiling_info.sdma_time:.3f}s({comp_profiling_info.sdma_num})') - cue = '' - if ((base_profiling_info.profiling_type == "NPU" and not base_profiling_info.minimal_profiling) or - (comp_profiling_info.profiling_type == "NPU" and not comp_profiling_info.minimal_profiling)): - - cue = '(Not minimal profiling)' - - headers.extend(['Free Time', 'E2E Time' + cue]) - base_col.extend( - [f'{base_profiling_info.scheduling_time:.3f}s', f'{base_profiling_info.e2e_time:.3f}s']) - comp_col.extend( - [f'{comp_profiling_info.scheduling_time:.3f}s', f'{comp_profiling_info.e2e_time:.3f}s']) - table.field_names = headers - table.add_row(base_col) - table.add_row(comp_col) - - -def show_table(base_profiling_info, comp_profiling_info): - table = PrettyTable() - table.title = 'Model Profiling Time Distribution' - generate_table_info(base_profiling_info, comp_profiling_info, table) - print(table) - - -def parse_gpu(gpu_path): - gpu_parser = GpuProfilingParser(gpu_path) - gpu_parser.parse_events() - return gpu_parser.profiling_info - - -def parse_npu(npu_path): - npu_dir = {'trace_view': None, 'memory_record': None, 'kernel_details': None} - for root, _, files in 
os.walk(npu_path): - for file in files: - if file == 'trace_view.json': - npu_dir['trace_view'] = os.path.join(root, file) - if file == 'memory_record.csv': - npu_dir['memory_record'] = os.path.join(root, file) - if 'kernel_details' in file: - npu_dir['kernel_details'] = os.path.join(root, file) - if 'profiler_info' in file: - npu_dir['info'] = os.path.join(root, file) - - npu_parser = NpuProfilingParser(0, npu_dir) - npu_parser.parse_npu_csv_events() - npu_parser.parse_info_json() - npu_parser.parse_npu_json_events() - return npu_parser.profiling_info - - -def prof_main(): - base_info = ProfilingInfo('None') - comp_info = ProfilingInfo('None') - if ArgsManager().base_profiling_type == Constant.NPU: - base_info = parse_npu(ArgsManager().base_profiling.file_path) - elif ArgsManager().base_profiling_type == Constant.GPU: - base_info = parse_gpu(ArgsManager().base_profiling.file_path) - if ArgsManager().comparison_profiling_type == Constant.NPU: - comp_info = parse_npu(ArgsManager().comparison_profiling.file_path) - elif ArgsManager().comparison_profiling_type == Constant.GPU: - comp_info = parse_gpu(ArgsManager().comparison_profiling.file_path) - - show_table(base_info, comp_info) - - -if __name__ == '__main__': - prof_main() diff --git a/profiler/compare_tools/profiling_parser/__init__.py b/profiler/compare_tools/profiling_parser/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/profiler/compare_tools/profiling_parser/base_profiling_parser.py b/profiler/compare_tools/profiling_parser/base_profiling_parser.py new file mode 100644 index 0000000000..97aaf6b473 --- /dev/null +++ b/profiler/compare_tools/profiling_parser/base_profiling_parser.py @@ -0,0 +1,116 @@ +from abc import abstractmethod, ABC + +from compare_bean.origin_data_bean.compare_event import KernelEvent +from compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_bean.profiling_info import ProfilingInfo +from utils.args_manager import ArgsManager +from 
utils.constant import Constant +from utils.file_reader import FileReader + + +class BaseProfilingParser(ABC): + def __init__(self, args: any, path_dict: dict): + self._args = args + self._profiling_type = path_dict.get(Constant.PROFILING_TYPE) + self._profiling_path = path_dict.get(Constant.PROFILING_PATH) + self._json_path = path_dict.get(Constant.TRACE_PATH) + self._trace_events = FileReader.read_trace_file(self._json_path) + self._torch_op_data = [] + self._kernel_dict = {} + self._memory_list = [] + self._communication_dict = {} + self._overall_metrics = ProfilingInfo(path_dict.get(Constant.PROFILING_TYPE)) + self._enable_profiling_compare = ArgsManager().enable_profiling_compare + self._enable_operator_compare = ArgsManager().enable_operator_compare + self._enable_memory_compare = ArgsManager().enable_memory_compare + self._enable_communication_compare = ArgsManager().enable_communication_compare + self._dispatch_func = self._get_dispatch_func() + self._memory_events = [] + self._flow_dict = {} + self._all_kernels = {} + + @abstractmethod + def _update_memory_list(self): + raise NotImplementedError("Function update_memory_list need to be implemented.") + + @abstractmethod + def _calculate_performance_time(self): + raise NotImplementedError("Function update_memory_list need to be implemented.") + + @abstractmethod + def _update_overall_metrics(self): + raise NotImplementedError("Function update_memory_list need to be implemented.") + + @abstractmethod + def _picking_communication_event(self, **kwargs): + raise NotImplementedError("Function picking_communication_event need to be implemented.") + + @abstractmethod + def _picking_torch_op_event(self, **kwargs): + raise NotImplementedError("Function picking_torch_op_event need to be implemented.") + + @abstractmethod + def _picking_kernel_event(self, **kwargs): + raise NotImplementedError("Function picking_kernel_event need to be implemented.") + + @abstractmethod + def _picking_flow_event(self, **kwargs): + raise 
NotImplementedError("Function picking_flow_event need to be implemented.") + + @abstractmethod + def _get_dispatch_func(self): + raise NotImplementedError("Function _get_dispatch_func need to be implemented.") + + def load_data(self) -> dict: + self._dispatch_events() + self._update_kernel_dict() + self._update_memory_list() + self._update_communication_dict() + if self._enable_profiling_compare: + self._calculate_performance_time() + self._update_overall_metrics() + self._check_result_data() + return {Constant.TORCH_OP: self._torch_op_data, Constant.KERNEL_DICT: self._kernel_dict, + Constant.MEMORY_LIST: self._memory_list, Constant.COMMUNICATION_DICT: self._communication_dict, + Constant.OVERALL_METRICS: self._overall_metrics} + + def _update_communication_dict(self): + pass + + def _dispatch_events(self): + for event in self._trace_events: + if not event.is_dict(): + continue + if event.is_m_mode(): + continue + self.__picking_event(event) + + def __picking_event(self, event: TraceEventBean): + for func in self._dispatch_func: + res = func(event) + if res: + break + + def _update_kernel_dict(self): + for flow_event in self._flow_dict.values(): + start_event = flow_event.get("start") + end_event = flow_event.get("end") + if not start_event or not end_event: + continue + kernel_event = self._all_kernels.get(f"{end_event.pid}-{end_event.tid}-{end_event.start_time}") + if not kernel_event: + continue + self._kernel_dict.setdefault(start_event.start_time, []).append( + KernelEvent(kernel_event.event, self._profiling_type)) + + def _check_result_data(self): + args = ArgsManager() + if args.enable_operator_compare or args.enable_memory_compare: + if not self._torch_op_data: + print(f"[WARNING] Can't find any torch op in the file: {self._profiling_path}") + if args.enable_operator_compare and not self._kernel_dict: + print(f"[WARNING] Can't find any flow event in the file: {self._profiling_path}") + if args.enable_memory_compare and not self._memory_list: + 
print(f"[WARNING] Can't find any memory event in the file: {self._profiling_path}") + if args.enable_communication_compare and not self._communication_dict: + print(f"[WARNING] Can't find any communication op in the file: {self._profiling_path}") diff --git a/profiler/compare_tools/profiling_parser/gpu_profiling_parser.py b/profiler/compare_tools/profiling_parser/gpu_profiling_parser.py new file mode 100644 index 0000000000..237eaed208 --- /dev/null +++ b/profiler/compare_tools/profiling_parser/gpu_profiling_parser.py @@ -0,0 +1,215 @@ +from collections import defaultdict, Counter + +from compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from profiling_parser.base_profiling_parser import BaseProfilingParser +from utils.args_manager import ArgsManager +from utils.constant import Constant + + +class OpTimeWarper: + def __init__( + self, + cube_time: float = 0.0, + sdma_time: float = 0.0, + vec_time: float = 0.0, + fa_time_fwd: float = 0.0, + fa_time_bwd: float = 0.0, + all_op_time: float = 0.0, + compute_stream_dur: float = 0.0, + cube_num: int = 0, + vec_num: int = 0, + sdma_num: int = 0, + fa_num_bwd: int = 0, + fa_num_fwd: int = 0 + ): + self.cube_time = cube_time + self.sdma_time = sdma_time + self.vec_time = vec_time + self.fa_time_fwd = fa_time_fwd + self.fa_time_bwd = fa_time_bwd + self.all_op_time = all_op_time + self.compute_stream_dur = compute_stream_dur + self.cube_num = cube_num + self.vec_num = vec_num + self.sdma_num = sdma_num + self.fa_num_bwd = fa_num_bwd + self.fa_num_fwd = fa_num_fwd + + +class GPUProfilingParser(BaseProfilingParser): + NCCL_MARK = 'nccl' + CUBE_MARK = 'gemm' + FA_MARK_LIST = [['fmha', 'kernel'], ['flash', 'kernel']] + SDMA_MARK_LIST = ['htod', 'dtod', 'dtoh', 'memset (device)'] + + def __init__(self, args: any, path_dict: dict): + super().__init__(args, path_dict) + self._trace_events = [TraceEventBean(event) for event in self._trace_events.get("traceEvents", [])] + self._flow_cat = 
(ArgsManager().args.gpu_flow_cat,) if ArgsManager().args.gpu_flow_cat else ( + "async_gpu", "async_cpu_to_gpu", "ac2g", "async") + self._compute_stream_id = self._infer_compute_stream_id() + self._time_wrapper = OpTimeWarper() + self._marks = defaultdict(int) + + @classmethod + def __is_flash_attention(cls, name: str): + for fa_mark in cls.FA_MARK_LIST: + if not len([1 for mark in fa_mark if mark not in name.lower()]): + return True + return False + + @classmethod + def __is_sdma_time(cls, name: str): + for mark in cls.SDMA_MARK_LIST: + if mark in name.lower(): + return True + return False + + def _update_memory_list(self): + if not self._enable_memory_compare: + return + self._memory_events.sort(key=lambda x: x.start_time) + addr_dict = {} + for memory_event in self._memory_events: + allocate_bytes = memory_event.args.get("Bytes", 0) / Constant.BYTE_TO_KB + record = addr_dict.get(memory_event.args.get("Addr")) + if allocate_bytes > 0: + if record: + self._memory_list.append(record) + addr_dict[memory_event.args.get("Addr")] = {Constant.SIZE: allocate_bytes, + Constant.TS: memory_event.start_time, + Constant.ALLOCATION_TIME: memory_event.start_time} + if allocate_bytes < 0 and record: + if abs(allocate_bytes) == record.get(Constant.SIZE): + record[Constant.RELEASE_TIME] = memory_event.start_time + self._memory_list.append(record) + del addr_dict[memory_event.args.get("Addr")] + + def _calculate_performance_time(self): + for event in self._trace_events: + if event.args and event.args.get('stream') == self._compute_stream_id: + self._time_wrapper.compute_stream_dur += event.dur + if not event.is_valid_event(): + continue + if event.args and event.args.get('stream') == self._compute_stream_id: + if self.__is_sdma_time(event.name): + self._time_wrapper.sdma_time += float(event.dur) + self._time_wrapper.sdma_num += 1 + continue + if event.lower_cat != 'kernel': + continue + if self.NCCL_MARK in event.lower_name: + for timestep in range(int(event.start_time + 1), 
int(event.end_time + 1)): + self._marks[str(timestep)] += 1 # mark this timestep in communication stream + continue + else: + for timestep in range(int(event.start_time + 1), int(event.end_time + 1)): + self._marks[str(timestep)] += -100 # mark this timestep in compute stream + if self.__is_flash_attention(event.name): + if 'bwd' in event.lower_name: + self._time_wrapper.fa_time_bwd += event.dur + self._time_wrapper.fa_num_bwd += 1 + else: + self._time_wrapper.fa_time_fwd += event.dur + self._time_wrapper.fa_num_fwd += 1 + elif self.CUBE_MARK in event.lower_name: + self._time_wrapper.cube_num += 1 + self._time_wrapper.cube_time += event.dur + else: + self._time_wrapper.vec_num += 1 + self._time_wrapper.vec_time += event.dur + self._time_wrapper.all_op_time += event.dur + + def _update_overall_metrics(self): + self._overall_metrics.compute_time = len([_ for _, value in self._marks.items() if value < 0]) / 10 ** 6 + self._overall_metrics.communication_not_overlapped = len( + [_ for _, value in self._marks.items() if value > 0]) / 10 ** 6 + self._overall_metrics.flash_attention_time_bwd = self._time_wrapper.fa_time_bwd / 10 ** 6 + self._overall_metrics.flash_attention_time_fwd = self._time_wrapper.fa_time_fwd / 10 ** 6 + self._overall_metrics.cube_time = self._time_wrapper.cube_time / 10 ** 6 + self._overall_metrics.vec_time = self._overall_metrics.compute_time - ( + self._time_wrapper.cube_time + self._time_wrapper.fa_time_fwd + self._time_wrapper.fa_time_bwd) / 10 ** 6 + self._overall_metrics.cube_num = self._time_wrapper.cube_num + self._overall_metrics.vec_num = self._time_wrapper.vec_num + self._overall_metrics.sdma_num = self._time_wrapper.sdma_num + self._overall_metrics.fa_num_bwd = self._time_wrapper.fa_num_bwd + self._overall_metrics.fa_num_fwd = self._time_wrapper.fa_num_fwd + self._overall_metrics.sdma_time = self._time_wrapper.sdma_time / 10 ** 6 + self.__parse_e2e_time() + self._overall_metrics.scheduling_time = self._overall_metrics.e2e_time - 
self._overall_metrics.compute_time - \ + self._overall_metrics.communication_not_overlapped + self.__parse_memory_reserved() + + def _picking_communication_event(self, event: TraceEventBean): + if event.lower_cat == "kernel" and event.lower_name.split("_")[0] == "ncclkernel": + name_list = event.lower_name.split("_") + if len(name_list) > 2: + self._communication_dict.setdefault(name_list[1], {}).setdefault("comm_list", []).append(event.dur) + return True + return False + + def _picking_memory_event(self, event: TraceEventBean): + if event.lower_name == '[memory]' and event.device_id >= 0: + self._memory_events.append(event) + return True + return False + + def _picking_torch_op_event(self, event: TraceEventBean): + if event.lower_cat in ("cpu_op", "user_annotation", "cuda_runtime", "operator"): + self._torch_op_data.append(event.event) + return True + return False + + def _picking_kernel_event(self, event: TraceEventBean): + if event.lower_cat == "kernel" and event.lower_name.split("_")[0] != "ncclkernel": + self._all_kernels[f"{event.pid}-{event.tid}-{event.start_time}"] = event + return True + return False + + def _picking_flow_event(self, event: TraceEventBean): + if event.lower_cat in self._flow_cat: + if event.is_flow_start(): + self._flow_dict.setdefault(event.id, {})["start"] = event + elif event.is_flow_end(): + self._flow_dict.setdefault(event.id, {})["end"] = event + return True + return False + + def __parse_e2e_time(self): + compute_events_timeline = [event for event in self._trace_events if event.args and event.args.get('stream')] + compute_events_timeline = sorted(compute_events_timeline, key=lambda event: event.start_time) + self._overall_metrics.e2e_time = (compute_events_timeline[-1].end_time - compute_events_timeline[ + 0].start_time) / 10 ** 6 + + def __parse_memory_reserved(self): + memories = [event.total_reserved for event in self._memory_events] + if not memories: + print("[INFO] Gpu profiling data doesn't contain memory info.") + return + 
self._overall_metrics.memory_used = max(memories) / 1024 ** 3 + + def _get_dispatch_func(self): + func_list = [] + if self._enable_memory_compare or self._enable_operator_compare: + func_list.append(self._picking_torch_op_event) + if self._enable_operator_compare: + func_list.append(self._picking_kernel_event) + func_list.append(self._picking_flow_event) + if self._enable_memory_compare or self._enable_profiling_compare: + func_list.append(self._picking_memory_event) + if self._enable_communication_compare: + func_list.append(self._picking_communication_event) + return func_list + + def _infer_compute_stream_id(self): + if not self._enable_profiling_compare: + return -1 + kernel_stream_ids = [] + for event in self._trace_events: + is_kernel_exec_event = event.lower_cat == 'kernel' and self.NCCL_MARK not in event.lower_name + if is_kernel_exec_event and event.stream: + kernel_stream_ids.append(event.stream) + if not kernel_stream_ids: + raise RuntimeError('[ERROR] The profiling data does not contain kernel running data.') + counter = Counter(kernel_stream_ids) + return counter.most_common(1)[0][0] diff --git a/profiler/compare_tools/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/profiling_parser/npu_profiling_parser.py new file mode 100644 index 0000000000..fe34c99fb4 --- /dev/null +++ b/profiler/compare_tools/profiling_parser/npu_profiling_parser.py @@ -0,0 +1,382 @@ +import os +import sys +from collections import defaultdict +from math import ceil + +import pandas as pd + +from compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from profiler.cluster_analyse.common_func.file_manager import FileManager +from profiler.cluster_analyse.common_func.path_manager import PathManager +from profiling_parser.base_profiling_parser import BaseProfilingParser +from utils.constant import Constant +from utils.file_reader import FileReader + + +class NpuInfoWrapper: + def __init__( + self, + e2e_time: int = 0, + compute_time: int = 0, + 
communication_time: int = 0, + sdma_time: int = 0, + sdma_num: int = 0, + is_cluster: bool = False, + event_wait_sqe: dict = defaultdict(list), + ai_core_dict: dict = defaultdict(list), + event_wait_sqe_res: dict = defaultdict(float), + ai_core_res: dict = defaultdict(float), + ): + self.e2e_time = e2e_time + self.compute_time = compute_time + self.communication_time = communication_time + self.sdma_time = sdma_time + self.sdma_num = sdma_num + self.is_cluster = is_cluster + self.event_wait_sqe = event_wait_sqe + self.ai_core_dict = ai_core_dict + self.event_wait_sqe_res = event_wait_sqe_res + self.ai_core_res = ai_core_res + + +class NPUProfilingParser(BaseProfilingParser): + FLASH_ATTENTION = "flashattention" + ACLNNINPLACE_COPY = "aclnninplacecopy" + TENSORMOVE = "tensormove" + + def __init__(self, args: any, path_dict: dict): + super().__init__(args, path_dict) + self._operator_memory_path = os.path.join(path_dict.get(Constant.ASCEND_OUTPUT_PATH, ""), "operator_memory.csv") + self._memory_record_path = os.path.join(path_dict.get(Constant.ASCEND_OUTPUT_PATH, ""), "memory_record.csv") + self._kernel_detail_path = os.path.join(path_dict.get(Constant.ASCEND_OUTPUT_PATH, ""), "kernel_details.csv") + self._info_json_path = path_dict.get(Constant.INFO_JSON_PATH, "") + self._trace_events = [TraceEventBean(event) for event in self._trace_events] + self._comm_task_list = [] + self._comm_list = [] + self._hccl_pid = None + self._hccl_op_tid_list = [] + self._kernel_pid = None + self._enqueue_dict = {} + self._dequeue_data = [] + self._npu_info_wrapper = NpuInfoWrapper() + self.npu_step_time = 0 + self.parallel_time = 0 + self.aicore_time = 0 + self.min_stream_ts = sys.float_info.max + self.max_stream_ts = sys.float_info.min + self._sdma_sqe = defaultdict(float) + self._sdma_num_cnt = defaultdict(int) + self._dispatch_func = self._get_dispatch_func() + self.__filter_meta_id() + + @staticmethod + def __interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list): + ans = 
0 + i = 0 + j = 0 + while i < len(cs_event_wait_sqe_list) and j < len(cs_ai_core_list): + lo = max(cs_event_wait_sqe_list[i][0], cs_ai_core_list[j][0]) + hi = min(cs_event_wait_sqe_list[i][1], cs_ai_core_list[j][1]) + if lo <= hi: + ans += (hi - lo) + if cs_event_wait_sqe_list[i][1] < cs_ai_core_list[j][1]: + i += 1 + else: + j += 1 + return ans + + @classmethod + def __match_cann_memory_data(cls, dequeue_data: list, ts_time: float): + if not dequeue_data: + return None + right = len(dequeue_data) - 1 + left = 0 + while right > left: + mid = left + ceil((right - left) / 2) + if ts_time >= dequeue_data[mid].start_time: + left = mid + else: + right = mid - 1 + end_time = dequeue_data[left].start_time + dequeue_data[left].dur + return dequeue_data[left].corr_id if end_time > ts_time else None + + def _get_dispatch_func(self): + func_list = [] + if self._enable_memory_compare or self._enable_operator_compare: + func_list.append(self._picking_torch_op_event) + if self._enable_operator_compare: + func_list.append(self._picking_kernel_event) + func_list.append(self._picking_flow_event) + if self._enable_memory_compare: + func_list.append(self._picking_task_queue_data) + if self._enable_communication_compare: + func_list.append(self._picking_communication_event) + return func_list + + def _update_memory_list(self): + if not os.path.exists(self._operator_memory_path): + return + memory_data = FileReader.read_csv_file(self._operator_memory_path) + self._dequeue_data.sort(key=lambda x: x.start_time) + for data in memory_data: + if not data.get(Constant.ALLOCATION_TIME, 0): + continue + if "cann::" in data.get("Name", ""): + ts_time = float(data.get(Constant.ALLOCATION_TIME, 0)) + matched_corr_id = self.__match_cann_memory_data(self._dequeue_data, ts_time) + if matched_corr_id is not None: + self._memory_list.append({Constant.SIZE: float(data.get(Constant.SIZE, 0)), + Constant.TS: self._enqueue_dict.get(matched_corr_id, 0), + Constant.NAME: data.get(Constant.NAME, ""), + 
Constant.ALLOCATION_TIME: float(data.get(Constant.ALLOCATION_TIME, 0)), + Constant.RELEASE_TIME: data.get(Constant.RELEASE_TIME, 0)}) + self._memory_list.append({Constant.SIZE: float(data.get(Constant.SIZE, 0)), + Constant.TS: float(data.get(Constant.ALLOCATION_TIME, 0)), + Constant.ALLOCATION_TIME: float(data.get(Constant.ALLOCATION_TIME, 0)), + Constant.RELEASE_TIME: data.get(Constant.RELEASE_TIME, 0)}) + + def _update_communication_dict(self): + for task_event in self._comm_task_list: + for communication_op in self._comm_list: + if task_event.start_time < communication_op.start_time or \ + task_event.start_time > communication_op.end_time: + continue + name_list = communication_op.lower_name.split("_") + if len(name_list) >= 2: + self._communication_dict.setdefault(name_list[1], {}).setdefault("comm_task", {}).setdefault( + task_event.name, []).append(task_event.dur) + break + + def _calculate_performance_time(self): + self.__parse_info_json() + self.__parse_mem_csv() + self.__parse_kernel_csv() + self.__parse_npu_trace_events() + + def _update_overall_metrics(self): + # a stream on which AI_CORE and EVENT_WAIT_SQE tasks coexist is the compute stream + compute_stream = [] + parallel_stream = [] + if not self._npu_info_wrapper.is_cluster: + # single-node single-card runs have no overlap analysis + if len(self._npu_info_wrapper.ai_core_dict) == 1: + compute_stream.append(min(self._npu_info_wrapper.ai_core_dict.keys())) + elif len(self._npu_info_wrapper.ai_core_dict) == 2: # two ai_core streams: a parallel stream exists (currently at most 2 operator compute streams) + compute_stream = list( + self._npu_info_wrapper.event_wait_sqe.keys() & self._npu_info_wrapper.ai_core_dict.keys()) + parallel_stream = list(self._npu_info_wrapper.ai_core_dict.keys() - set(compute_stream)) + else: + print('[WARNING] Npu trace json file lack of Stream info') + return + cs_event_wait_sqe_list = self._npu_info_wrapper.event_wait_sqe[compute_stream[0]] + if parallel_stream: + cs_ai_core_list = self._npu_info_wrapper.ai_core_dict[parallel_stream[0]] + cs_event_wait_sqe_list = sorted(cs_event_wait_sqe_list, key=lambda x: (x[0]))  # sorted() returns a new list; the interval sweep below needs ordered input + cs_ai_core_list = sorted(cs_ai_core_list,
key=lambda x: (x[0])) + self.parallel_time = self.__interval_intersection(cs_event_wait_sqe_list, cs_ai_core_list) + self._overall_metrics.compute_time = self._npu_info_wrapper.compute_time / 10 ** 6 if \ + self._npu_info_wrapper.is_cluster else self._npu_info_wrapper.ai_core_res[compute_stream[0]] / 10 ** 6 + self._overall_metrics.other_time = max(0, self._overall_metrics.compute_time - + self._overall_metrics.cube_time - + self._overall_metrics.flash_attention_time_fwd - + self._overall_metrics.flash_attention_time_bwd - + self._overall_metrics.vec_time) + self._overall_metrics.e2e_time = self._npu_info_wrapper.e2e_time / 10 ** 6 if \ + self._npu_info_wrapper.is_cluster else (self.max_stream_ts - self.min_stream_ts) / 10 ** 6 + self._overall_metrics.communication_not_overlapped = self._npu_info_wrapper.communication_time / 10 ** 6 \ + if self._npu_info_wrapper.is_cluster else (self._npu_info_wrapper.event_wait_sqe_res[compute_stream[0]] + - self.parallel_time) / 10 ** 6 + time_required = self._overall_metrics.compute_time + self._overall_metrics.communication_not_overlapped + self._overall_metrics.sdma_time += self._npu_info_wrapper.sdma_time / 10 ** 6 + self._overall_metrics.sdma_num += self._npu_info_wrapper.sdma_num + if self.npu_step_time: + self._overall_metrics.scheduling_time = self.npu_step_time - time_required + else: + self._overall_metrics.scheduling_time = self._overall_metrics.e2e_time - time_required + + def _picking_communication_event(self, event: TraceEventBean): + if event.pid != self._hccl_pid: + return False + if event.tid in self._hccl_op_tid_list: + name_list = event.lower_name.split("_") + if len(name_list) > 2: + self._comm_list.append(event) + self._communication_dict.setdefault(name_list[1], {}).setdefault("comm_list", []).append(event.dur) + else: + self._comm_task_list.append(event) + return True + + def _picking_torch_op_event(self, event: TraceEventBean): + if event.lower_cat == "cpu_op": + self._torch_op_data.append(event.event) + 
return True + return False + + def _picking_kernel_event(self, event: TraceEventBean): + if event.pid == self._kernel_pid and event.is_x_mode(): + self._all_kernels[f"{event.pid}-{event.tid}-{event.start_time}"] = event + return True + return False + + def _picking_flow_event(self, event: TraceEventBean): + if event.lower_cat == "async_npu": + if event.is_flow_start(): + self._flow_dict.setdefault(event.id, {})["start"] = event + elif event.is_flow_end(): + self._flow_dict.setdefault(event.id, {})["end"] = event + return True + return False + + def _picking_task_queue_data(self, event: TraceEventBean): + if event.lower_cat == "enqueue": + self._enqueue_dict[event.corr_id] = event.start_time + elif event.lower_cat == "dequeue": + self._dequeue_data.append(event) + + def __filter_meta_id(self): + if not self._enable_communication_compare and not self._enable_operator_compare: + return + for event in self._trace_events: + if event.is_process_meta(): + if event.is_hccl_process_name(): + self._hccl_pid = event.pid + elif event.is_npu_process_name(): + self._kernel_pid = event.pid + if not self._enable_communication_compare: + return + for event in self._trace_events: + if event.is_thread_meta(): + if event.pid != self._hccl_pid: + continue + if event.is_communication_op_thread(): + self._hccl_op_tid_list.append(event.tid) + + def __parse_info_json(self): + if not os.path.exists(self._info_json_path): + return + json_data = FileReader.read_trace_file(self._info_json_path) + if not json_data: + return + if "ProfilerActivity.CPU" in json_data.get('config', {}).get('common_config', {}).get('activities', []): + return + if 'Level0' != json_data.get('config', {}).get('experimental_config', {}).get('_profiler_level', ''): + return + self._overall_metrics.minimal_profiling = True + + def __parse_kernel_csv(self): + if not os.path.exists(self._kernel_detail_path): + print('[WARNING] Npu kernel details csv file is not available.') + return + 
PathManager.check_path_readable(self._kernel_detail_path) + FileManager.check_file_size(self._kernel_detail_path) + info = pd.read_csv(self._kernel_detail_path, index_col=None) + if info.get('mac_time(us)') is None and info.get('aiv_vec_time(us)') is None: + self._overall_metrics.hide_op_details = True + return + for i in range(len(info['Model ID'])): + op_type = info.loc[i, 'Type'] + name = info.loc[i, 'Name'] + aiv_vec_time = info.loc[i, 'aiv_vec_time(us)'] if info.get('aiv_vec_time(us)') is not None else None + mac_time = info.loc[i, 'mac_time(us)'] if info.get('mac_time(us)') is not None else None + if pd.isna(aiv_vec_time) and pd.isna(mac_time): + continue + task_durations = info.loc[i, 'Duration(us)'] + if self.FLASH_ATTENTION in op_type.lower(): + if 'bwd' in op_type.lower() or 'grad' in op_type.lower(): + self._overall_metrics.flash_attention_time_bwd += task_durations + self._overall_metrics.fa_num_bwd += 1 + else: + self._overall_metrics.flash_attention_time_fwd += task_durations + self._overall_metrics.fa_num_fwd += 1 + elif name.lower().startswith(self.ACLNNINPLACE_COPY) and self.TENSORMOVE in name.lower(): + self._overall_metrics.sdma_time += task_durations + self._overall_metrics.sdma_num += 1 + else: + is_vec = (aiv_vec_time and aiv_vec_time > 0) or (mac_time is not None and mac_time == 0) + if is_vec: + self._overall_metrics.vec_time += task_durations + self._overall_metrics.vec_num += 1 + else: + self._overall_metrics.cube_time += task_durations + self._overall_metrics.cube_num += 1 + + self._overall_metrics.cube_time = self._overall_metrics.cube_time / 10 ** 6 + self._overall_metrics.vec_time = self._overall_metrics.vec_time / 10 ** 6 + self._overall_metrics.flash_attention_time_bwd = self._overall_metrics.flash_attention_time_bwd / 10 ** 6 + self._overall_metrics.flash_attention_time_fwd = self._overall_metrics.flash_attention_time_fwd / 10 ** 6 + self._overall_metrics.sdma_time = self._overall_metrics.sdma_time / 10 ** 6 + + def 
__parse_mem_csv(self): + if not os.path.exists(self._memory_record_path): + print('[INFO] Npu op memory csv file is not available.') + return + try: + PathManager.check_path_readable(self._memory_record_path) + FileManager.check_file_size(self._memory_record_path) + info = pd.read_csv(self._memory_record_path, usecols=['Total Reserved(MB)'], index_col=None) + except ValueError: + print('[ERROR] Load memory info failed.') + else: + self._overall_metrics.memory_used = max(info.get('Total Reserved(MB)')) / 1024 + + def __parse_npu_trace_events(self): + min_ts = sys.float_info.max + max_ts = sys.float_info.min + for event in self._trace_events: + self.__get_ts_by_task_type(event, self._npu_info_wrapper.event_wait_sqe, + self._npu_info_wrapper.ai_core_dict, + self._npu_info_wrapper.event_wait_sqe_res, self._npu_info_wrapper.ai_core_res) + if event.is_computing_event(): + self._npu_info_wrapper.is_cluster = True + self._npu_info_wrapper.compute_time += event.dur + min_ts = event.start_time if event.start_time < min_ts else min_ts + max_ts = event.end_time if event.end_time > max_ts else max_ts + elif event.is_comm_not_overlap(): + self._npu_info_wrapper.is_cluster = True + self._npu_info_wrapper.communication_time += event.dur + min_ts = event.start_time if event.start_time < min_ts else min_ts + max_ts = event.end_time if event.end_time > max_ts else max_ts + self._npu_info_wrapper.sdma_time, self._npu_info_wrapper.sdma_num = \ + self.__get_sdma_para(self._sdma_sqe, + self._sdma_num_cnt, + self._npu_info_wrapper.ai_core_dict, + self._npu_info_wrapper.event_wait_sqe) + + self._npu_info_wrapper.e2e_time = max_ts - min_ts + + def __get_ts_by_task_type(self, event: TraceEventBean, event_wait_sqe, ai_core_dict, enent_wait_res, ai_core_res): + if event.stream_id: + stream_id = event.stream_id + if event.task_type == 'EVENT_WAIT_SQE': + enent_wait_res[stream_id] += event.dur + event_wait_sqe[stream_id].append([event.start_time, event.end_time]) + elif event.task_type in 
('SDMA_SQE', 'PCIE_DMA_SQE'): + self._sdma_sqe[stream_id] += event.dur + self._sdma_num_cnt[stream_id] += 1 + elif event.task_type in ('AI_CORE', 'MIX_AIC', 'MIX_AIV', 'AI_CPU', 'AI_VECTOR_CORE', 'FFTS_PLUS'): + ai_core_res[stream_id] += event.dur + ai_core_dict[stream_id].append([event.start_time, event.end_time]) + self.min_stream_ts = event.start_time if event.start_time < self.min_stream_ts else self.min_stream_ts + self.max_stream_ts = event.end_time if event.end_time > self.max_stream_ts else self.max_stream_ts + + def __get_sdma_para(self, sdma_sqe, sdma_num_cnt, ai_core_dict, event_wait_sqe) -> (float, int): + compute_stream = [] + parallel_stream = [] + sdma_time = 0.0 + sdma_parallel_time = 0.0 + sdma_num = 0 + sdma_parallel_num = 0 + if len(ai_core_dict) == 1: + compute_stream.append(min(ai_core_dict.keys())) + elif len(ai_core_dict) == 2: # 2个ai_core,存在并行流(当前最多2条算子计算流) + compute_stream = list(event_wait_sqe.keys() & ai_core_dict.keys()) + parallel_stream = list(ai_core_dict.keys() - set(compute_stream)) + else: + print('[WARNING] Npu Compute Stream Num Error.') + if parallel_stream: + sdma_parallel_time = sdma_sqe[parallel_stream[0]] + sdma_parallel_num = sdma_num_cnt[parallel_stream[0]] + if compute_stream: + sdma_time = sdma_sqe[compute_stream[0]] + sdma_parallel_time + sdma_num = sdma_num_cnt[compute_stream[0]] + sdma_parallel_num + return sdma_time, sdma_num diff --git a/profiler/compare_tools/utils/args_manager.py b/profiler/compare_tools/utils/args_manager.py index 543e8f60a8..564e7dd1af 100644 --- a/profiler/compare_tools/utils/args_manager.py +++ b/profiler/compare_tools/utils/args_manager.py @@ -3,7 +3,6 @@ import os.path from common_func.path_manager import PathManager from utils.constant import Constant from utils.file_reader import FileReader -from utils.profiling_parser import GPUProfilingParser, NPUProfilingParser class Singleton(object): @@ -19,38 +18,55 @@ class Singleton(object): @Singleton class ArgsManager: - PARSER_DICT = 
{Constant.NPU: NPUProfilingParser, Constant.GPU: GPUProfilingParser} def __init__(self): self._args = None - self._base_profiling_type = None - self._comparison_profiling_type = None - self._base_profiling = None - self._comparison_profiling = None + self._base_path_dict = {} + self._comparison_path_dict = {} + + @property + def args(self): + return self._args @property def base_profiling_type(self): - return self._base_profiling_type + return self._base_path_dict.get(Constant.PROFILING_TYPE) @property def comparison_profiling_type(self): - return self._comparison_profiling_type + return self._comparison_path_dict.get(Constant.PROFILING_TYPE) @property - def base_profiling(self): - return self._base_profiling + def base_profiling_path(self): + return self._args.base_profiling_path @property - def comparison_profiling(self): - return self._comparison_profiling + def comparison_profiling_path(self): + return self._args.comparison_profiling_path  # same args attribute that init() passes to PathManager.get_realpath @property - def base_profiling_path(self): - return self._args.base_profiling_path + def base_path_dict(self): + return self._base_path_dict @property - def comparison_profiling_path(self): - return self._args.comparison_profiling_path + def comparison_path_dict(self): + return self._comparison_path_dict + + @property + def enable_profiling_compare(self): + return self._args.enable_profiling_compare + + @property + def enable_operator_compare(self): + return self._args.enable_operator_compare + + @property + def enable_memory_compare(self): + return self._args.enable_memory_compare + + @property + def enable_communication_compare(self): + return self._args.enable_communication_compare @classmethod def check_profiling_path(cls, file_path: str): @@ -77,13 +93,11 @@ class ArgsManager: ascend_output = os.path.join(file_path, "ASCEND_PROFILER_OUTPUT") profiler_output = ascend_output if os.path.isdir(ascend_output) else file_path json_path = os.path.join(profiler_output, "trace_view.json") - memory_path =
os.path.join(profiler_output, "operator_memory.csv") if not os.path.isfile(json_path): msg = f"Invalid profiling path: {file_path}" raise RuntimeError(msg) - memory_path = memory_path if os.path.isfile(memory_path) else None return {Constant.PROFILING_TYPE: Constant.NPU, Constant.PROFILING_PATH: file_path, - Constant.TRACE_PATH: json_path, Constant.MEMORY_DATA_PATH: memory_path} + Constant.TRACE_PATH: json_path, Constant.ASCEND_OUTPUT_PATH: profiler_output} def init(self, args: any): self._args = args @@ -106,24 +120,10 @@ class ArgsManager: base_profiling_path = PathManager.get_realpath(self._args.base_profiling_path) self.check_profiling_path(base_profiling_path) - base_profiling_dict = self.parse_profiling_path(base_profiling_path) + self._base_path_dict = self.parse_profiling_path(base_profiling_path) comparison_profiling_path = PathManager.get_realpath(self._args.comparison_profiling_path) self.check_profiling_path(comparison_profiling_path) - comparison_profiling_dict = self.parse_profiling_path(comparison_profiling_path) + self._comparison_path_dict = self.parse_profiling_path(comparison_profiling_path) if self._args.output_path: self.check_output_path(PathManager.get_realpath(self._args.output_path)) - - Constant.BASE_PROFILING = Constant.BASE_PROFILING + self._args.base_profiling_path - self._base_profiling_type = base_profiling_dict.get(Constant.PROFILING_TYPE) - self._base_profiling = self.PARSER_DICT.get(self._base_profiling_type)(self._args, base_profiling_dict) - - if base_profiling_path == comparison_profiling_path: - Constant.COMPARISON_PROFILING = "Same To Base Profiling" - self._comparison_profiling_type = self._base_profiling_type - self._comparison_profiling = self._base_profiling - else: - Constant.COMPARISON_PROFILING = Constant.COMPARISON_PROFILING + self._args.comparison_profiling_path - self._comparison_profiling_type = comparison_profiling_dict.get(Constant.PROFILING_TYPE) - self._comparison_profiling = 
self.PARSER_DICT.get(self._comparison_profiling_type)(self._args, - comparison_profiling_dict) diff --git a/profiler/compare_tools/utils/common_func.py b/profiler/compare_tools/utils/common_func.py index 9e45d202be..8c5d19908b 100644 --- a/profiler/compare_tools/utils/common_func.py +++ b/profiler/compare_tools/utils/common_func.py @@ -4,3 +4,9 @@ def calculate_diff_ratio(base_value: float, comparison_value: float): else: ratio = float('inf') if not base_value else comparison_value / base_value return [comparison_value - base_value, ratio] + + +def update_order_id(data_list: list): + for index, data in enumerate(data_list): + if data: + data[0] = index + 1 diff --git a/profiler/compare_tools/utils/constant.py b/profiler/compare_tools/utils/constant.py index 7d7f25738f..62f2f3c87b 100644 --- a/profiler/compare_tools/utils/constant.py +++ b/profiler/compare_tools/utils/constant.py @@ -1,6 +1,6 @@ class Constant(object): - GPU = 0 - NPU = 1 + GPU = "GPU" + NPU = "NPU" NA = 'N/A' LIMIT_KERNEL = 3 MAX_PATH_LENGTH = 4096 @@ -22,11 +22,12 @@ class Constant(object): DIR_AUTHORITY = 0o750 PROFILING_TYPE = "profiling type" - ASCEND_OUTPUT_PATH = "ascend output" + # path PROFILING_PATH = "profiling_path" TRACE_PATH = "trace_path" - MEMORY_DATA_PATH = "memory_data_path" + ASCEND_OUTPUT_PATH = "ascend_output" + INFO_JSON_PATH = "info_path" # excel headers BASE_PROFILING = 'Base Profiling: ' @@ -36,12 +37,13 @@ class Constant(object): OPERATOR_COMPARE = "OperatorCompare" MEMORY_COMPARE = "MemoryCompare" - # sheet name - OPERATOR_SHEET = "OperatorCompare" - MEMORY_SHEET = "MemoryCompare" - OPERATOR_TOP_SHEET = "OperatorCompareStatistic" - MEMORY_TOP_SHEET = "MemoryCompareStatistic" - COMMUNICATION_SHEET = "CommunicationCompare" + # table name + OPERATOR_TABLE = "OperatorCompare" + MEMORY_TABLE = "MemoryCompare" + OPERATOR_TOP_TABLE = "OperatorCompareStatistic" + MEMORY_TOP_TABLE = "MemoryCompareStatistic" + COMMUNICATION_TABLE = "CommunicationCompare" + PERFORMANCE_TABLE = "Model 
Profiling Time Distribution" # memory SIZE = "Size(KB)" @@ -52,3 +54,11 @@ class Constant(object): OP_KEY = "op_name" DEVICE_DUR = "dur" + + BASE_DATA = "base_data" + COMPARISON_DATA = "comparison_data" + OVERALL_METRICS = "overall_metrics" + TORCH_OP = "torch_op" + KERNEL_DICT = "kernel_dict" + MEMORY_LIST = "memory_list" + COMMUNICATION_DICT = "comm_dict" diff --git a/profiler/compare_tools/utils/excel_config.py b/profiler/compare_tools/utils/excel_config.py index 1783b5d81b..123cf31836 100644 --- a/profiler/compare_tools/utils/excel_config.py +++ b/profiler/compare_tools/utils/excel_config.py @@ -1,8 +1,22 @@ from utils.constant import Constant +class CellFormatType: + DEFAULT = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', 'border': True, + 'num_format': '#,##0'} # 数字显示整数,无背景色 + DEFAULT_FLOAT = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', 'border': True, + 'num_format': '#,##0.00'} # 保留2位小数,无背景色 + DEFAULT_RATIO = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', + 'border': True, 'num_format': '0.00%'} # 百分比显示,保留2位小数,无背景色 + RED_RATIO = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', + 'border': True, 'num_format': '0.00%', "fg_color": Constant.RED_COLOR} # 百分比显示,保留2位小数,单元格背景色为红色 + BOLD_STR = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', 'border': True, + 'bold': True} # 字符串,无背景色,字体加粗 + BLUE_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.BLUE_COLOR, 'align': 'left', + 'valign': 'vcenter', 'bold': True, 'border': True} # 蓝色背景,加粗 + + class ExcelConfig(object): - COL_IDS = "ABCDEFGHIJKLMNOPQRSTUVW" ORDER = "Order Id" OPERATOR_NAME = "Operator Name" INPUT_SHAPE = "Input Shape" @@ -34,67 +48,79 @@ class ExcelConfig(object): MIN_DURATION = "Min Duration(us)" HEADERS = { - Constant.OPERATOR_SHEET: [ORDER, OPERATOR_NAME, INPUT_SHAPE, INPUT_TYPE, KERNEL_DETAILS, DEVICE_DURATION, - OPERATOR_NAME, 
INPUT_SHAPE, INPUT_TYPE, KERNEL_DETAILS, DEVICE_DURATION, DIFF_DUR, - DIFF_RATIO], - Constant.MEMORY_SHEET: [ORDER, OPERATOR_NAME, INPUT_SHAPE, INPUT_TYPE, MEMORY_DETAILS, SIZE, OPERATOR_NAME, - INPUT_SHAPE, INPUT_TYPE, MEMORY_DETAILS, SIZE, DIFF_SIZE, DIFF_RATIO], - Constant.OPERATOR_TOP_SHEET: [TOP, OPERATOR_NAME, BASE_DEVICE_DURATION, BASE_OPERATOR_NUMBER, - COMPARISON_DEVICE_DURATION, COMPARISON_OPERATOR_NUMBER, DIFF_TIME, DIFF_RATIO], - Constant.MEMORY_TOP_SHEET: [TOP, OPERATOR_NAME, BASE_ALLOCATED_TIMES, BASE_ALLOCATED_MEMORY, - BASE_OPERATOR_NUMBER, COMPARISON_ALLOCATED_TIMES, COMPARISON_ALLOCATED_MEMORY, - COMPARISON_OPERATOR_NUMBER, DIFF_MEMORY, DIFF_RATIO], - Constant.COMMUNICATION_SHEET: [ORDER, COMM_OP_NAME, TASK_NAME, CALLS, TOTAL_DURATION, AVG_DURATION, - MAX_DURATION, MIN_DURATION, COMM_OP_NAME, TASK_NAME, CALLS, TOTAL_DURATION, - AVG_DURATION, MAX_DURATION, MIN_DURATION, DIFF_DUR, DIFF_RATIO] + Constant.OPERATOR_TABLE: [ + {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30}, + {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": INPUT_TYPE, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": DEVICE_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30}, + {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": INPUT_TYPE, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": DEVICE_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_DUR, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20} + ], + Constant.MEMORY_TABLE: [ + {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": 
OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30}, + {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": INPUT_TYPE, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": MEMORY_DETAILS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": SIZE, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30}, + {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": INPUT_TYPE, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": MEMORY_DETAILS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": SIZE, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_SIZE, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20} + ], + Constant.OPERATOR_TOP_TABLE: [ + {"name": TOP, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30}, + {"name": BASE_DEVICE_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 25}, + {"name": BASE_OPERATOR_NUMBER, "type": CellFormatType.DEFAULT, "width": 25}, + {"name": COMPARISON_DEVICE_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 30}, + {"name": COMPARISON_OPERATOR_NUMBER, "type": CellFormatType.DEFAULT, "width": 30}, + {"name": DIFF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20} + ], + Constant.MEMORY_TOP_TABLE: [ + {"name": TOP, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30}, + {"name": BASE_ALLOCATED_TIMES, "type": CellFormatType.DEFAULT_FLOAT, "width": 25}, + {"name": BASE_ALLOCATED_MEMORY, "type": CellFormatType.DEFAULT_FLOAT, "width": 30}, + {"name": BASE_OPERATOR_NUMBER, "type": CellFormatType.DEFAULT, "width": 25}, + {"name": COMPARISON_ALLOCATED_TIMES, "type": 
CellFormatType.DEFAULT_FLOAT, "width": 27}, + {"name": COMPARISON_ALLOCATED_MEMORY, "type": CellFormatType.DEFAULT_FLOAT, "width": 33}, + {"name": COMPARISON_OPERATOR_NUMBER, "type": CellFormatType.DEFAULT, "width": 25}, + {"name": DIFF_MEMORY, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20} + ], + Constant.COMMUNICATION_TABLE: [ + {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": COMM_OP_NAME, "type": CellFormatType.BOLD_STR, "width": 25}, + {"name": TASK_NAME, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": CALLS, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": TOTAL_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17}, + {"name": AVG_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17}, + {"name": MAX_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17}, + {"name": MIN_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17}, + {"name": COMM_OP_NAME, "type": CellFormatType.BOLD_STR, "width": 25}, + {"name": TASK_NAME, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": CALLS, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": TOTAL_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17}, + {"name": AVG_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17}, + {"name": MAX_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17}, + {"name": MIN_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17}, + {"name": DIFF_DUR, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20} + ] } - COLUMNS = {ORDER: 10, OPERATOR_NAME: 30, TOP: 10, BASE_OPERATOR_NUMBER: 25, BASE_DEVICE_DURATION: 25, - COMPARISON_OPERATOR_NUMBER: 30, COMPARISON_DEVICE_DURATION: 30, BASE_ALLOCATED_TIMES: 25, - BASE_ALLOCATED_MEMORY: 30, COMPARISON_ALLOCATED_TIMES: 27, COMPARISON_ALLOCATED_MEMORY: 33, - CALLS: 10, TOTAL_DURATION: 17, AVG_DURATION: 17, 
MAX_DURATION: 17, MIN_DURATION: 17, COMM_OP_NAME: 25} - - OVERHEAD = {Constant.OPERATOR_SHEET: ["B1:F1", "G1:K1"], Constant.MEMORY_SHEET: ["B1:F1", "G1:K1"], - Constant.COMMUNICATION_SHEET: ["B1:H1", "I1:O1"], Constant.OPERATOR_TOP_SHEET: ["C1:D1", "E1:F1"], - Constant.MEMORY_TOP_SHEET: ["C1:E1", "F1:H1"]} - - FORMAT = {"int": {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', 'border': True, - 'num_format': '#,##0'}, - "float": {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', 'border': True, - 'num_format': '#,##0.00'}, - "ratio": {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', - 'border': True, 'num_format': '0.00%'}, - "ratio_red": {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', - 'border': True, 'num_format': '0.00%', "fg_color": Constant.RED_COLOR}, - "str_bold": {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', 'border': True, - 'bold': True}} - - FIELD_TYPE_MAP = {ORDER: "int", - OPERATOR_NAME: "str_bold", - INPUT_SHAPE: "int", - INPUT_TYPE: "str", - KERNEL_DETAILS: "int", - MEMORY_DETAILS: "int", - DEVICE_DURATION: "float", - DIFF_RATIO: "ratio", - DIFF_DUR: "float", - DIFF_SIZE: "float", - SIZE: "float", - TOP: "int", - BASE_DEVICE_DURATION: "float", - COMPARISON_DEVICE_DURATION: "float", - BASE_OPERATOR_NUMBER: "int", - COMPARISON_OPERATOR_NUMBER: "int", - DIFF_TIME: "float", - BASE_ALLOCATED_TIMES: "float", - COMPARISON_ALLOCATED_TIMES: "float", - BASE_ALLOCATED_MEMORY: "float", - COMPARISON_ALLOCATED_MEMORY: "float", - DIFF_MEMORY: "float", - COMM_OP_NAME: "str_bold", - TASK_NAME: "int", - CALLS: "int", - TOTAL_DURATION: "float", - AVG_DURATION: "float", - MAX_DURATION: "float", - MIN_DURATION: "float"} + OVERHEAD = {Constant.OPERATOR_TABLE: ["B1:F1", "G1:K1"], Constant.MEMORY_TABLE: ["B1:F1", "G1:K1"], + Constant.COMMUNICATION_TABLE: ["B1:H1", "I1:O1"], Constant.OPERATOR_TOP_TABLE: ["C1:D1", "E1:F1"], + 
Constant.MEMORY_TOP_TABLE: ["C1:E1", "F1:H1"]} diff --git a/profiler/compare_tools/utils/profiling_parser.py b/profiler/compare_tools/utils/profiling_parser.py deleted file mode 100644 index 30dfce4ef8..0000000000 --- a/profiler/compare_tools/utils/profiling_parser.py +++ /dev/null @@ -1,300 +0,0 @@ -from abc import abstractmethod -from math import ceil - -from utils.compare_event import KernelEvent -from utils.constant import Constant -from utils.file_reader import FileReader -from utils.trace_event_data import TraceEventData - - -class ProfilingParser: - def __init__(self, args: any, path_dict: dict): - self._args = args - self._profiling_path = path_dict.get(Constant.PROFILING_PATH) - self._torch_op_data = None - self._kernel_dict = None - self._memory_list = None - self._communication_data = None - self._communication_task_data = None - - @property - def file_path(self) -> str: - return self._profiling_path - - @property - def json_path(self) -> str: - return self._json_path - - @property - def torch_op_data(self) -> list: - if self._torch_op_data is None: - self.get_torch_op_data() - return self._torch_op_data - - @property - def kernel_dict(self) -> dict: - if self._kernel_dict is None: - self.get_kernel_dict() - return self._kernel_dict - - @property - def memory_list(self) -> list: - if self._memory_list is None: - self.get_memory_list() - return self._memory_list - - @property - def communication_data(self) -> dict: - if self._communication_data is None: - self.get_communication_data() - return self._communication_data - - @property - def communication_task_data(self) -> dict: - if self._communication_task_data is None: - self.get_communication_data() - return self._communication_task_data - - @abstractmethod - def get_torch_op_data(self): - raise NotImplementedError - - @abstractmethod - def get_kernel_dict(self): - raise NotImplementedError - - @abstractmethod - def get_memory_list(self): - raise NotImplementedError - - -class 
GPUProfilingParser(ProfilingParser): - def __init__(self, args: any, path_dict: dict): - super().__init__(args, path_dict) - self._json_path = path_dict.get(Constant.PROFILING_PATH) - - def get_torch_op_data(self): - torch_op_list = [] - json_data = FileReader.read_trace_file(self._json_path) - total_events = json_data.get("traceEvents", []) - for event in total_events: - if event.get("cat", "").lower() in ("cpu_op", "user_annotation", "cuda_runtime", "Operator"): - torch_op_list.append(event) - self._torch_op_data = torch_op_list - - def get_kernel_dict(self): - flow_kernel_dict = {} - json_data = FileReader.read_trace_file(self._json_path) - total_events = json_data.get("traceEvents", []) - flow_cat = (self._args.gpu_flow_cat,) if self._args.gpu_flow_cat else ("async_gpu", "async_cpu_to_gpu", - "ac2g", "async") - flow_start_dict, flow_end_dict, kernel_dict = {}, {}, {} - for event in total_events: - if event.get("cat", "") in flow_cat and event.get("ph") == "s": - flow_start_dict[event.get("id")] = event - elif event.get("cat", "") in flow_cat and event.get("ph") == "f": - flow_end_dict[event.get("id")] = event - elif event.get("cat", "").lower() == "kernel" and event.get("name", "").split("_")[0].lower() != "ncclkernel": - kernel_dict["{}-{}-{}".format(event.get("pid"), event.get("tid"), float(event.get("ts")))] = event - - for flow_id, start_flow in flow_start_dict.items(): - end_flow = flow_end_dict.get(flow_id) - if end_flow is None: - continue - kernel_event = kernel_dict.get( - "{}-{}-{}".format(end_flow.get("pid"), end_flow.get("tid"), float(end_flow.get("ts")))) - if kernel_event is None: - continue - flow_kernel_dict.setdefault(float(start_flow.get("ts")), []).append(KernelEvent(kernel_event, Constant.GPU)) - self._kernel_dict = flow_kernel_dict - - def get_memory_list(self): - self._memory_list = [] - memory_events = [] - json_data = FileReader.read_trace_file(self._json_path) - total_events = json_data.get("traceEvents", []) - for event in 
total_events: - if event.get("name", "").lower() == "[memory]": - memory_events.append(event) - memory_events.sort(key=lambda x: float(x.get("ts", 0))) - addr_dict = {} - for memory_event in memory_events: - args = memory_event.get("args", {}) - if args.get("Device Type", -1) != 1: - continue - allocate_bytes = args.get("Bytes", 0) / Constant.BYTE_TO_KB - record = addr_dict.get(args.get("Addr")) - if allocate_bytes > 0: - if record: - self._memory_list.append(record) - addr_dict[args.get("Addr")] = {Constant.SIZE: allocate_bytes, - Constant.TS: float(memory_event.get("ts", 0)), - Constant.ALLOCATION_TIME: float(memory_event.get("ts", 0))} - if allocate_bytes < 0 and record: - if abs(allocate_bytes) == record.get(Constant.SIZE): - record[Constant.RELEASE_TIME] = float(memory_event.get("ts", 0)) - self._memory_list.append(record) - del addr_dict[args.get("Addr")] - - def get_communication_data(self): - self._communication_data, self._communication_task_data = [], {} - json_data = FileReader.read_trace_file(self._json_path) - total_events = json_data.get("traceEvents", []) - for data in total_events: - if data.get("cat", "").lower() == "kernel" and data.get("name", "").split("_")[0].lower() == "ncclkernel": - self._communication_data.append(data) - - -class NPUProfilingParser(ProfilingParser): - def __init__(self, args: any, path_dict: str): - super().__init__(args, path_dict) - self._json_path = path_dict.get(Constant.TRACE_PATH) - self._memory_data_path = path_dict.get(Constant.MEMORY_DATA_PATH) - - def get_torch_op_data(self): - torch_op_list = [] - json_data = FileReader.read_trace_file(self._json_path) - for event in json_data: - if event.get("cat", "").lower() == "cpu_op": - torch_op_list.append(event) - self._torch_op_data = torch_op_list - - def get_kernel_dict(self): - flow_kernel_dict = {} - json_data = FileReader.read_trace_file(self._json_path) - flow_cat = "async_npu" - - flow_start_dict, flow_end_dict, kernel_dict = {}, {}, {} - for event in json_data: - 
if event.get("cat", "") == flow_cat and event.get("ph") == "s": - flow_start_dict[event.get("id")] = event - elif event.get("cat", "") == flow_cat and event.get("ph") == "f": - flow_end_dict[event.get("id")] = event - elif event.get("ph") == "X" and event.get("cat", "") != 'cpu_op': - kernel_dict["{}-{}-{}".format(event.get("pid"), event.get("tid"), float(event.get("ts")))] = event - - for flow_id, start_flow in flow_start_dict.items(): - end_flow = flow_end_dict.get(flow_id) - if end_flow is None: - continue - kernel_event = kernel_dict.get( - "{}-{}-{}".format(end_flow.get("pid"), end_flow.get("tid"), float(end_flow.get("ts")))) - if kernel_event is None: - continue - flow_kernel_dict.setdefault(float(start_flow.get("ts")), []).append(KernelEvent(kernel_event, Constant.NPU)) - self._kernel_dict = flow_kernel_dict - - def get_memory_list(self): - self._memory_list = [] - enqueue_dict, dequeue_data = {}, [] - json_data = FileReader.read_trace_file(self._json_path) - for data in json_data: - if data.get("cat", "").lower() == "enqueue": - enqueue_dict[data.get("args", {}).get("correlation_id", "")] = data - elif data.get("cat", "").lower() == "dequeue": - dequeue_data.append(data) - - if not self._memory_data_path: - return - memory_data = FileReader.read_csv_file(self._memory_data_path) - for data in memory_data: - if not data.get(Constant.ALLOCATION_TIME, 0): - continue - if "cann::" in data.get("Name", ""): - ts_time = float(data.get(Constant.ALLOCATION_TIME, 0)) - match_dequeue_data = self._match_cann_memory_data(dequeue_data, ts_time) - if match_dequeue_data is not None: - correlation_id = match_dequeue_data.get("args", {}).get("correlation_id", "") - ts = float(enqueue_dict.get(correlation_id, {}).get("ts", 0)) - self._memory_list.append({Constant.SIZE: float(data.get(Constant.SIZE, 0)), Constant.TS: ts, - Constant.NAME: data.get(Constant.NAME, ""), - Constant.ALLOCATION_TIME: float(data.get(Constant.ALLOCATION_TIME, 0)), - Constant.RELEASE_TIME: 
data.get(Constant.RELEASE_TIME, 0)}) - self._memory_list.append({Constant.SIZE: float(data.get(Constant.SIZE, 0)), - Constant.TS: float(data.get(Constant.ALLOCATION_TIME, 0)), - Constant.ALLOCATION_TIME: float(data.get(Constant.ALLOCATION_TIME, 0)), - Constant.RELEASE_TIME: data.get(Constant.RELEASE_TIME, 0)}) - - @classmethod - def _match_cann_memory_data(cls, dequeue_data: list, ts_time: float): - if not dequeue_data: - return None - right = len(dequeue_data) - 1 - left = 0 - while right > left: - mid = left + ceil((right - left) / 2) - if ts_time >= float(dequeue_data[mid].get("ts", 0)): - left = mid - else: - right = mid - 1 - end_time = float(dequeue_data[left].get("ts", 0)) + dequeue_data[left].get("dur", 0) - return dequeue_data[left] if end_time > ts_time else None - - def get_communication_data(self): - def get_pid(json_data): - pid = None - for data in json_data: - trace_event = TraceEventData(data) - if not trace_event.is_process_meta(): - continue - if trace_event.is_hccl_process(): - pid = trace_event.pid - break - return pid - - def get_tid_list(pid, tid_list, json_data): - for data in json_data: - trace_event = TraceEventData(data) - if not trace_event.is_thread_meta(): - continue - if trace_event.pid != pid: - continue - if trace_event.is_communication_op_thread(): - tid_list.append(trace_event.tid) - - def get_comm_data(pid, tid_list, json_data): - for data in json_data: - trace_event = TraceEventData(data) - if not trace_event.is_x_mode(): - continue - if trace_event.pid != pid: - continue - if trace_event.tid in tid_list: - self._communication_data.append(data) - - def get_comm_task_data(pid, tid_list, json_data): - for data in json_data: - trace_event = TraceEventData(data) - if not trace_event.is_x_mode(): - continue - if trace_event.pid != pid: - continue - if trace_event.tid in tid_list: - continue - ts = trace_event.start_time - for communication_op in self._communication_data: - comm_op_event = TraceEventData(communication_op) - if ts < 
comm_op_event.start_time or ts > comm_op_event.end_time: - continue - name_list = communication_op.get("name", "").split("_") - if len(name_list) >= 2: - self._communication_task_data.setdefault(name_list[1].lower(), []).append(data) - break - - self._communication_data, self._communication_task_data = [], {} - json_data = FileReader.read_trace_file(self._json_path) - - pid = get_pid(json_data) - if pid is None: - return - - tid_list = [] - get_tid_list(pid, tid_list, json_data) - if not tid_list: - return - - get_comm_data(pid, tid_list, json_data) - if not self._communication_data: - return - - get_comm_task_data(pid, tid_list, json_data) diff --git a/profiler/compare_tools/utils/torch_op_node.py b/profiler/compare_tools/utils/torch_op_node.py index c62526c766..56f5f6c2ad 100644 --- a/profiler/compare_tools/utils/torch_op_node.py +++ b/profiler/compare_tools/utils/torch_op_node.py @@ -1,6 +1,6 @@ from math import ceil -from utils.compare_event import MemoryEvent +from compare_bean.origin_data_bean.compare_event import MemoryEvent from utils.constant import Constant diff --git a/profiler/compare_tools/utils/trace_event_data.py b/profiler/compare_tools/utils/trace_event_data.py deleted file mode 100644 index ff70b230e7..0000000000 --- a/profiler/compare_tools/utils/trace_event_data.py +++ /dev/null @@ -1,42 +0,0 @@ -class TraceEventData: - - def __init__(self, event: dict): - self._event = event - - @property - def pid(self) -> int: - return self._event.get("pid", "") - - @property - def tid(self) -> int: - return self._event.get("tid", "") - - @property - def process_name(self) -> int: - return self._event.get("args", {}).get("name", "") - - @property - def start_time(self) -> float: - return float(self._event.get("ts", 0)) - - @property - def end_time(self) -> float: - return float(self._event.get("ts", 0)) + self._event.get("dur", 0) - - def is_m_mode(self) -> bool: - return self._event.get("ph", "") == "M" - - def is_x_mode(self) -> bool: - return 
self._event.get("ph", "") == "X" - - def is_process_meta(self) -> bool: - return self.is_m_mode() and self._event.get("name", "") == "process_name" - - def is_thread_meta(self) -> bool: - return self.is_m_mode() and self._event.get("name", "") == "thread_name" - - def is_communication_op_thread(self) -> bool: - return self._event.get("args", {}).get("name", "").find("Communication") != -1 - - def is_hccl_process(self) -> bool: - return self.process_name == "HCCL" diff --git a/profiler/compare_tools/view/base_view.py b/profiler/compare_tools/view/base_view.py new file mode 100644 index 0000000000..d18980b7de --- /dev/null +++ b/profiler/compare_tools/view/base_view.py @@ -0,0 +1,10 @@ +from abc import ABC, abstractmethod + + +class BaseView(ABC): + def __init__(self, data_dict: dict): + self._data_dict = data_dict + + @abstractmethod + def generate_view(self): + raise NotImplementedError("Function generate_view need to be implemented.") diff --git a/profiler/compare_tools/view/excel_view.py b/profiler/compare_tools/view/excel_view.py index 457012bbc8..864a136a3f 100644 --- a/profiler/compare_tools/view/excel_view.py +++ b/profiler/compare_tools/view/excel_view.py @@ -2,19 +2,21 @@ import os from xlsxwriter import Workbook +from view.base_view import BaseView from view.work_sheet_creator import WorkSheetCreator from utils.constant import Constant -class ExcelViewer: +class ExcelView(BaseView): - def __init__(self, data_dict: dict, file_path: str): - self._data_dict = data_dict + def __init__(self, data_dict: dict, file_path: str, args: any): + super().__init__(data_dict) self._file_path = file_path + self._args = args def generate_view(self): workbook = Workbook(self._file_path) for sheet_name, data in self._data_dict.items(): - WorkSheetCreator(workbook, sheet_name, data).create_sheet() + WorkSheetCreator(workbook, sheet_name, data, self._args).create_sheet() workbook.close() os.chmod(self._file_path, Constant.FILE_AUTHORITY) diff --git 
a/profiler/compare_tools/view/screen_view.py b/profiler/compare_tools/view/screen_view.py new file mode 100644 index 0000000000..9c256ac3ab --- /dev/null +++ b/profiler/compare_tools/view/screen_view.py @@ -0,0 +1,19 @@ +from prettytable import PrettyTable + +from view.base_view import BaseView + + +class ScreenView(BaseView): + def __init__(self, data_dict: dict): + super().__init__(data_dict) + + def generate_view(self): + for sheet_name, data in self._data_dict.items(): + if not data.get("rows", []): + return + table = PrettyTable() + table.title = sheet_name + table.field_names = data.get("headers", []) + for row in data.get("rows", []): + table.add_row(row) + print(table) diff --git a/profiler/compare_tools/view/work_sheet_creator.py b/profiler/compare_tools/view/work_sheet_creator.py index 909d444531..ef7f8deedd 100644 --- a/profiler/compare_tools/view/work_sheet_creator.py +++ b/profiler/compare_tools/view/work_sheet_creator.py @@ -1,67 +1,52 @@ from xlsxwriter import Workbook -from utils.args_manager import ArgsManager -from utils.constant import Constant -from utils.excel_config import ExcelConfig +from utils.excel_config import ExcelConfig, CellFormatType class WorkSheetCreator: - def __init__(self, work_book: Workbook, sheet_name: str, data: list): + def __init__(self, work_book: Workbook, sheet_name: str, data: dict, args: any): self._work_book = work_book self._sheet_name = sheet_name self._data = data + self._args = args self._work_sheet = None self._row_id = 1 + self._field_format = {} + self._diff_ratio_index = None + self._col_ids = "ABCDEFGHIJKLMNOPQRSTUVW" def create_sheet(self): + if not self._data.get("rows", []): + return self._work_sheet = self._work_book.add_worksheet(self._sheet_name) self._write_headers() self._write_data() def _write_headers(self): - header_format = self._work_book.add_format( - {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.BLUE_COLOR, 'align': 'left', - 'valign': 'vcenter', 'bold': True, 'border': True}) - 
headers = ExcelConfig.HEADERS.get(self._sheet_name, []) - overhead = ExcelConfig.OVERHEAD.get(self._sheet_name, []) + header_format = self._work_book.add_format(CellFormatType.BLUE_BOLD) + overhead = self._data.get("overhead", []) if overhead: - base_path = f"Base Profiling: {ArgsManager().base_profiling_path}" + base_path = f"Base Profiling: {self._args.base_profiling_path}" self._work_sheet.merge_range(overhead[0], base_path, header_format) - if ArgsManager().base_profiling_path == ArgsManager().comparison_profiling_path: - comparison_path = "Same To Base Profiling" - else: - comparison_path = f"Comparison Profiling: {ArgsManager().comparison_profiling_path}" + comparison_path = f"Comparison Profiling: {self._args.comparison_profiling_path}" self._work_sheet.merge_range(overhead[1], comparison_path, header_format) self._row_id += 2 - for index, header in enumerate(headers): - column_width = ExcelConfig.COLUMNS.get(header, 20) - col_id = ExcelConfig.COL_IDS[index] - self._work_sheet.set_column(f"{col_id}:{col_id}", column_width) - self._work_sheet.write(f"{col_id}{self._row_id}", header, header_format) + for index, header in enumerate(self._data.get("headers")): + col_id = self._col_ids[index] + self._work_sheet.set_column(f"{col_id}:{col_id}", header.get("width")) + self._work_sheet.write(f"{col_id}{self._row_id}", header.get("name"), header_format) + self._field_format[index] = self._work_book.add_format(header.get("type")) + if header.get("name") == ExcelConfig.DIFF_RATIO: + self._diff_ratio_index = index self._row_id += 1 def _write_data(self): - default_format = self._work_book.add_format(ExcelConfig.FORMAT.get("int")) - red_ratio_format = self._work_book.add_format(ExcelConfig.FORMAT.get("ratio_red")) - headers = ExcelConfig.HEADERS.get(self._sheet_name, []) - field_format = {} - diff_ratio_index = None - for index, header in enumerate(headers): - format_dict = ExcelConfig.FORMAT.get(ExcelConfig.FIELD_TYPE_MAP.get(header, "int")) - if not format_dict: - 
format_dict = ExcelConfig.FORMAT.get("int") - field_format[index] = self._work_book.add_format(format_dict) - if header == ExcelConfig.DIFF_RATIO: - diff_ratio_index = index - order_id = 1 - for data in self._data: - self._work_sheet.write(f"{ExcelConfig.COL_IDS[0]}{self._row_id}", order_id, default_format) + red_ratio_format = self._work_book.add_format(CellFormatType.RED_RATIO) + for data in self._data.get("rows"): for index, cell_data in enumerate(data): - data_index = index + 1 - cell_format = field_format.get(data_index, default_format) - if data_index == diff_ratio_index and cell_data and cell_data > 1: + cell_format = self._field_format.get(index) + if index == self._diff_ratio_index and cell_data and cell_data > 1: cell_format = red_ratio_format cell_data = "INF" if cell_data == float('inf') else cell_data - self._work_sheet.write(f"{ExcelConfig.COL_IDS[data_index]}{self._row_id}", cell_data, cell_format) - order_id += 1 + self._work_sheet.write(f"{self._col_ids[index]}{self._row_id}", cell_data, cell_format) self._row_id += 1 -- Gitee