From c46f7fb3eeab4e1768380d2e12ef7ec4badc3141 Mon Sep 17 00:00:00 2001 From: starmountain1997 Date: Wed, 10 Jul 2024 10:16:56 +0800 Subject: [PATCH] =?UTF-8?q?=E6=80=A7=E8=83=BD=E6=8B=86=E8=A7=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../comparator/base_comparator.py | 2 +- .../comparator/overall_metrics_comparator.py | 138 ++++++++++++++++++ .../origin_data_bean/kernel_details_bean.py | 34 ++++- .../origin_data_bean/trace_event_bean.py | 42 +++++- .../compare_bean/overall_metrics_bean.py | 20 +++ .../compare_bean/profiling_info.py | 126 ++++++++++++++++ .../generator/detail_performance_generator.py | 15 +- .../profiling_parser/base_profiling_parser.py | 71 +++++++++ .../profiling_parser/gpu_profiling_parser.py | 16 +- .../profiling_parser/npu_profiling_parser.py | 28 ++-- .../compare_backend/utils/constant.py | 11 +- .../compare_backend/utils/excel_config.py | 32 +++- .../view/work_sheet_creator.py | 28 +++- 13 files changed, 533 insertions(+), 30 deletions(-) create mode 100644 profiler/compare_tools/compare_backend/comparator/overall_metrics_comparator.py create mode 100644 profiler/compare_tools/compare_backend/compare_bean/overall_metrics_bean.py diff --git a/profiler/compare_tools/compare_backend/comparator/base_comparator.py b/profiler/compare_tools/compare_backend/comparator/base_comparator.py index 330fb871ee..8012dfae94 100644 --- a/profiler/compare_tools/compare_backend/comparator/base_comparator.py +++ b/profiler/compare_tools/compare_backend/comparator/base_comparator.py @@ -21,4 +21,4 @@ class BaseComparator(ABC): @abstractmethod def _compare(self): - raise NotImplementedError("Function _compare need to be implemented.") + raise NotImplementedError("Function _compare need to be implemented.") \ No newline at end of file diff --git a/profiler/compare_tools/compare_backend/comparator/overall_metrics_comparator.py b/profiler/compare_tools/compare_backend/comparator/overall_metrics_comparator.py 
from typing import Any, Dict, List, Optional

from compare_backend.comparator.base_comparator import BaseComparator
from compare_backend.utils.constant import Constant


class OverallMetricsComparator(BaseComparator):
    """Build the rows of the overall-metrics sheet from two profiling summaries.

    ``origin_data`` maps ``Constant.BASE_DATA`` and ``Constant.COMPARISON_DATA``
    to the two objects whose attributes are read through the section tables
    below.  Every produced row has nine cells:

        [name, base duration, base duration / base e2e, base count,
         comparison duration, comparison duration / comparison e2e,
         comparison count, duration diff, comparison / base ratio]

    Leading tab characters in a row name encode its nesting depth.
    """

    # Each section is (row headers, duration attributes, count attributes).
    # A section with one more header than duration attributes gets a leading
    # summary row named by that extra first header.  ``None`` in the count
    # list means "no count available" and is reported as 0.
    #
    # Sections whose first row is added up into the "Computing Time" row.
    _COMPUTING_SECTIONS = (
        (
            ["\tFlash Attention",
             "\t\tFlash Attention (Forward) (Cube)", "\t\tFlash Attention (Backward) (Cube)",
             "\t\tFlash Attention (Forward) (Vector)", "\t\tFlash Attention (Backward) (Vector)"],
            ["fa_time_fwd_cube", "fa_time_bwd_cube", "fa_time_fwd_vector", "fa_time_bwd_vector"],
            ["fa_num_fwd_cube", "fa_num_bwd_cube", "fa_num_fwd_vector", "fa_num_bwd_vector"],
        ),
        (
            ["\tConv",
             "\t\tConv (Forward) (Cube)", "\t\tConv (Backward) (Cube)",
             "\t\tConv (Forward) (Vector)", "\t\tConv (Backward) (Vector)"],
            ["conv_time_fwd_cube", "conv_time_bwd_cube", "conv_time_fwd_vector", "conv_time_bwd_vector"],
            ["conv_num_fwd_cube", "conv_num_bwd_cube", "conv_num_fwd_vector", "conv_num_bwd_vector"],
        ),
        (
            ["\tMatmul", "\t\tMatmul (Cube)", "\t\tMatmul (Vector)"],
            ["matmul_time_cube", "matmul_time_vector"],
            ["matmul_num_cube", "matmul_num_vector"],
        ),
        (
            ["\tVector", "\t\tVector (trans)", "\t\tVector (no trans)"],
            ["vector_time_trans", "vector_time_notrans"],
            ["vector_num_trans", "vector_num_notrans"],
        ),
        (["\tSDMA"], ["sdma_time_tensor_move"], ["sdma_num_tensor_move"]),
        (["\tOther"], ["other_time_ms"], [None]),
    )

    # Sections appended after the computing block, in display order.
    _TRAILING_SECTIONS = (
        (
            ["Uncovered Communication Time", "\tWait", "\tTransmit"],
            ["wait_time_ms", "transmit_time_ms"],
            [None, None],
        ),
        (
            ["Free Time", "\tSDMA", "\tFree"],
            ["sdma_time_stream", "scheduling_time_ms"],
            ["sdma_num_stream", None],
        ),
        (["E2E Time"], ["e2e_time_ms"], [None]),
    )

    def __init__(self, origin_data: Dict, bean: Any):
        """Store the input data and pick up the per-row styles from ``bean.ROW_STYLE``."""
        super().__init__(origin_data, bean)
        self._row_style = bean.ROW_STYLE

    @property
    def base_info(self):
        """Baseline summary object (``origin_data[Constant.BASE_DATA]``), or None if absent."""
        return self._origin_data.get(Constant.BASE_DATA)

    @property
    def comp_info(self):
        """Comparison summary object (``origin_data[Constant.COMPARISON_DATA]``), or None if absent."""
        return self._origin_data.get(Constant.COMPARISON_DATA)

    def generate_data(self) -> Dict:
        """Run the comparison and return ``{sheet name: sheet description}``.

        Besides headers, rows and overhead, the description carries a
        ``"row_style"`` entry holding the styles taken from the bean.
        """
        self._compare()
        return {self._sheet_name: {
            "headers": self._headers,
            "rows": self._rows,
            "overhead": self._overhead,
            "row_style": self._row_style
        }}

    def _compare(self):
        """Fill ``self._rows``; leave it untouched when either end-to-end time is 0."""
        if self.base_info.e2e_time_ms == 0 or self.comp_info.e2e_time_ms == 0:
            return

        computing_sections = [self._cal_row_data(*section) for section in self._COMPUTING_SECTIONS]
        # The first row of every computing section is its total (or its only row);
        # cells 1/3/4/6 are base duration, base count, comparison duration, comparison count.
        leading_rows = [section[0] for section in computing_sections]
        computing_total = ["Computing Time"] + self._cal_sum(
            [row[1] for row in leading_rows],
            [row[3] for row in leading_rows],
            [row[4] for row in leading_rows],
            [row[6] for row in leading_rows],
        )

        rows = [computing_total]
        for section in computing_sections:
            rows.extend(section)
        for section in self._TRAILING_SECTIONS:
            rows.extend(self._cal_row_data(*section))
        self._rows = rows

    @staticmethod
    def _safe_div(numerator, denominator):
        """Return ``numerator / denominator``, or 0 when the denominator is 0."""
        return numerator / denominator if denominator != 0 else 0

    def _metric_cells(self, base_time, base_num, comp_time, comp_num) -> List:
        """Return the eight value cells of one row (everything except the name)."""
        return [
            base_time,
            self._safe_div(base_time, self.base_info.e2e_time_ms),
            base_num,
            comp_time,
            self._safe_div(comp_time, self.comp_info.e2e_time_ms),
            comp_num,
            comp_time - base_time,
            self._safe_div(comp_time, base_time),
        ]

    def _cal_sum(self,
                 base_info_times: List[float], base_info_nums: List[int],
                 comp_info_times: List[float], comp_info_nums: List[int],
                 ):
        """Add up the given durations/counts and return the eight value cells of the total row."""
        return self._metric_cells(
            sum(base_info_times), sum(base_info_nums),
            sum(comp_info_times), sum(comp_info_nums),
        )

    def _cal_row_data(self, row_headers: List[str], time_attrs: List[str], num_attrs: List[Optional[str]]):
        """Return the rows of one section.

        Args:
            row_headers: Row names; one extra leading name requests a summary row.
            time_attrs: Attribute names read from both summaries for the durations.
            num_attrs: Attribute names for the counts, ``None`` where a row has no count.

        Returns:
            A list of nine-cell rows, the summary row (if requested) first.
        """
        base, comp = self.base_info, self.comp_info
        # Read every attribute exactly once; several of them are computed properties.
        base_times = [getattr(base, attr) for attr in time_attrs]
        comp_times = [getattr(comp, attr) for attr in time_attrs]
        base_nums = [getattr(base, attr) if attr else 0 for attr in num_attrs]
        comp_nums = [getattr(comp, attr) if attr else 0 for attr in num_attrs]

        rows = []
        if len(row_headers) == len(time_attrs) + 1:  # the extra first header names the summary row
            rows.append([row_headers[0]] + self._cal_sum(base_times, base_nums, comp_times, comp_nums))
            row_headers = row_headers[1:]
        rows.extend(
            [header] + self._metric_cells(*cells)
            for header, *cells in zip(row_headers, base_times, base_nums, comp_times, comp_nums)
        )
        return rows
b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/kernel_details_bean.py @@ -1,8 +1,9 @@ import math +from decimal import Decimal import pandas as pd -from compare_backend.utils.common_func import convert_to_float +from compare_backend.utils.common_func import convert_to_float, convert_to_decimal from compare_backend.utils.constant import Constant @@ -12,8 +13,10 @@ class KernelDetailsBean: self._op_type = "" self._name = "" self._aiv_vec_time = 0.0 + self._aicore_time = 0.0 self._mac_time = 0.0 self._duration = 0.0 + self._start_time = Decimal("0") self.init() @property @@ -30,6 +33,12 @@ class KernelDetailsBean: return float("nan") return convert_to_float(self._aiv_vec_time) + @property + def aicore_time(self) -> float: + if self._aicore_time == "" or self._aicore_time == "N/A": + return float("nan") + return convert_to_float(self._aicore_time) + @property def mac_time(self) -> float: if self._mac_time == "" or self._mac_time == "N/A": @@ -40,6 +49,18 @@ class KernelDetailsBean: def duration(self) -> float: return convert_to_float(self._duration) + @property + def dur(self) -> float: + return convert_to_float(self._duration) + + @property + def start_time(self) -> Decimal: + return convert_to_decimal(self._start_time) + + @property + def end_time(self) -> Decimal: + return self.start_time + convert_to_decimal(self._duration) + def is_hide_op_pmu(self): if "mac_time(us)" in self._data.keys() or "aiv_vec_time(us)" in self._data.keys(): return False @@ -79,9 +100,20 @@ class KernelDetailsBean: def is_page_attention(self): return "pagedattention" in self.op_type.lower() + def is_trans(self): + return any(trans_mask in self.name.lower() for trans_mask in Constant.KERNEL_TRANS_MASK) + + def is_cube_categorize_performance(self): + return self.mac_time > 0 or self.aicore_time > 0 + + def is_vector_for_categorize_performance(self): + return not self.is_sdma() and self.aiv_vec_time > 0 + def init(self): self._op_type = self._data.get('Type', "") 
self._name = self._data.get('Name', "") self._aiv_vec_time = self._data.get('aiv_vec_time(us)', "") + self._aicore_time = self._data.get("aicore_time(us)", "") self._mac_time = self._data.get('mac_time(us)', "") self._duration = self._data.get('Duration(us)', 0) + self._start_time = Decimal(self._data.get("Start Time(us)", "0")) diff --git a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py index cef6bb0712..65506f8c17 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py +++ b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py @@ -199,11 +199,51 @@ class TraceEventBean: self._name = name def is_conv(self): - return self.name.lower().startswith("aten::conv") + return self.lower_name.startswith("aten::conv") def is_lccl(self): return self.lower_name == "kernel_aivec" + def is_fa_for_cpu_op(self) -> bool: + """ + 这个类在cpu op和gpu中均有用到,这里是在cpu op阶段判断 + """ + return any(cube_mask in self.lower_name for cube_mask in Constant.CPU_OP_FA_MASK) + + def is_conv_for_cpu_op(self) -> bool: + """ + 这个类在cpu op和gpu中均有用到,这里是在cpu op阶段判断 + """ + return self.lower_name.startswith(Constant.CPU_OP_CONV_MASK) + + def is_matmul_for_cpu_op(self) -> bool: + """ + 这个类在cpu op和gpu中均有用到,这里是在cpu op阶段判断 + """ + return any(matmul_mask in self.lower_name for matmul_mask in Constant.CPU_OP_MATMUL_MASK) + + def is_bwd_for_cpu_op(self) -> bool: + """ + 这个类在cpu op和gpu中均有用到,这里是在cpu op阶段判断 + """ + return any(bwd_mask in self.lower_name for bwd_mask in Constant.BWD_LIST) + + def is_cube_categorize_performance(self): + return any(cube_mask in self.lower_name for cube_mask in Constant.KERNEL_CUBE_MASK) + + @staticmethod + def is_trans() -> bool: + """ + 暂时没找到GPU判断trans的方法,暂时都是notrans + """ + return False + + def is_cpu_op(self) -> bool: + return self.is_matmul_for_cpu_op() or 
self.is_fa_for_cpu_op() or self.is_conv_for_cpu_op() + + def is_vector_for_categorize_performance(self): + return not self.is_sdma_event() and not self.is_nccl_name() + def init(self): if isinstance(self._event, dict): self._pid = self._event.get("pid", 0) diff --git a/profiler/compare_tools/compare_backend/compare_bean/overall_metrics_bean.py b/profiler/compare_tools/compare_backend/compare_bean/overall_metrics_bean.py new file mode 100644 index 0000000000..1b3872be52 --- /dev/null +++ b/profiler/compare_tools/compare_backend/compare_bean/overall_metrics_bean.py @@ -0,0 +1,20 @@ +from typing import List + +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig + + +class OverallMetricsBean: + TABLE_NAME = Constant.OVERALL_METRICS_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + ROW_STYLE = ExcelConfig.ROW_STYLE.get(TABLE_NAME) + + def __init__(self, name: str, base_data: List, comparison_data: List): + self._name = name + self._base_info = OverallMetricsInfo(base_data) + self._comparison = OverallMetricsInfo(comparison_data) + +class OverallMetricsInfo: + def __init__(self, data_list: List): + self._data_list = data_list \ No newline at end of file diff --git a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py index e5d9bf26e9..3c566dbcb9 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py +++ b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py @@ -37,6 +37,60 @@ class ProfilingInfo: self.hide_op_details = False self.is_level0 = False + # 性能拆解新指标 + self.fa_time_fwd_cube = 0.0 + self.fa_num_fwd_cube = 0 + self.fa_time_bwd_cube = 0.0 + self.fa_num_bwd_cube = 0 + self.fa_time_fwd_vector = 0.0 + self.fa_num_fwd_vector = 0 + self.fa_time_bwd_vector = 0.0 + self.fa_num_bwd_vector = 0 + + self.conv_time_fwd_cube = 0.0 
+ self.conv_num_fwd_cube = 0 + self.conv_time_bwd_cube = 0.0 + self.conv_num_bwd_cube = 0 + self.conv_time_fwd_vector = 0.0 + self.conv_num_fwd_vector = 0 + self.conv_time_bwd_vector = 0.0 + self.conv_num_bwd_vector = 0 + + self.matmul_time_cube = 0.0 + self.matmul_num_cube = 0 + self.matmul_time_vector = 0.0 + self.matmul_num_vector = 0 + + self.vector_time_trans = 0.0 + self.vector_num_trans = 0 + self.vector_time_notrans = 0.0 + self.vector_num_notrans = 0 + + self.sdma_time_tensor_move = 0.0 + self.sdma_num_tensor_move = 0 + self.sdma_time_stream = 0.0 + self.sdma_num_stream = 0 + + @property + def e2e_time_ms(self): + return self.e2e_time * 10 ** 3 + + @property + def wait_time_ms(self): + return self.wait_time * 10 ** 3 + + @property + def transmit_time_ms(self): + return (self.communication_not_overlapped - self.wait_time) * 10 ** 3 + + @property + def other_time_ms(self): + return self.other_time * 10 ** 3 + + @property + def scheduling_time_ms(self): + return self.scheduling_time * 10 ** 3 + def trans_time_to_s(self): self.cube_time = self.cube_time / 10 ** 6 self.other_time = self.other_time / 10 ** 6 @@ -54,6 +108,22 @@ class ProfilingInfo: self.conv_time_fwd = self.conv_time_fwd / 10 ** 6 self.conv_time_bwd = self.conv_time_bwd / 10 ** 6 + # 新指标单位为ms + self.fa_time_fwd_cube /= 10 ** 3 + self.fa_time_bwd_cube /= 10 ** 3 + self.fa_time_fwd_vector /= 10 ** 3 + self.fa_time_bwd_vector /= 10 ** 3 + self.conv_time_fwd_cube /= 10 ** 3 + self.conv_time_bwd_cube /= 10 ** 3 + self.conv_time_fwd_vector /= 10 ** 3 + self.conv_time_bwd_vector /= 10 ** 3 + self.matmul_time_cube /= 10 ** 3 + self.matmul_time_vector /= 10 ** 3 + self.vector_time_trans /= 10 ** 3 + self.vector_time_notrans /= 10 ** 3 + self.sdma_time_tensor_move /= 10 ** 3 + self.sdma_time_stream /= 10 ** 3 + def calculate_other_time(self): self.other_time = max( [0, self.compute_time - self.cube_time - self.fa_time_fwd - self.fa_time_bwd - @@ -75,6 +145,30 @@ class ProfilingInfo: self.fa_time_bwd += 
time self.fa_num_bwd += 1 + def update_fa_fwd_cube_info(self, time: float): + self.fa_time_fwd_cube += time + self.fa_num_fwd_cube += 1 + + def update_fa_bwd_cube_info(self, time: float): + self.fa_time_bwd_cube += time + self.fa_num_bwd_cube += 1 + + def update_fa_fwd_vector_info(self, time: float): + self.fa_time_fwd_vector += time + self.fa_num_fwd_vector += 1 + + def update_fa_bwd_vector_info(self, time: float): + self.fa_time_bwd_vector += time + self.fa_num_bwd_vector += 1 + + def update_sdma_tensor_move_info(self, time: float): + self.sdma_time_tensor_move += time + self.sdma_num_tensor_move += 1 + + def update_sdma_stream_info(self, time: float, num: int = 1): + self.sdma_time_stream += time + self.sdma_num_stream += num + def update_pa_info(self, time: float): self.pa_time += time self.pa_num += 1 @@ -91,6 +185,38 @@ class ProfilingInfo: self.conv_time_bwd += time self.conv_num_bwd += 1 + def update_conv_bwd_cube_info(self, time: float): + self.conv_time_bwd_cube += time + self.conv_num_bwd_cube += 1 + + def update_conv_fwd_cube_info(self, time: float): + self.conv_time_fwd_cube += time + self.conv_num_fwd_cube += 1 + + def update_conv_bwd_vector_info(self, time: float): + self.conv_time_bwd_vector += time + self.conv_num_bwd_vector += 1 + + def update_conv_fwd_vector_info(self, time: float): + self.conv_time_fwd_vector += time + self.conv_num_fwd_vector += 1 + + def update_matmul_cube_info(self, time: float): + self.matmul_time_cube += time + self.matmul_num_cube += 1 + + def update_matmul_vector_info(self, time: float): + self.matmul_time_vector += time + self.matmul_num_vector += 1 + + def update_vector_trans_info(self, time: float): + self.vector_time_trans += time + self.vector_num_trans += 1 + + def update_vector_notrans_info(self, time: float): + self.vector_time_notrans += time + self.vector_num_notrans += 1 + def update_sdma_info(self, time: float, num: int = 1): self.sdma_time += time self.sdma_num += num diff --git 
a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py index 5b93d888a4..292e312815 100644 --- a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py +++ b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py @@ -8,6 +8,7 @@ from compare_backend.comparator.module_comparetor import ModuleComparator from compare_backend.comparator.module_statistic_comparator import ModuleStatisticComparator from compare_backend.comparator.operator_comparator import OperatorComparator from compare_backend.comparator.operator_statistic_comparator import OperatorStatisticComparator +from compare_backend.comparator.overall_metrics_comparator import OverallMetricsComparator from compare_backend.compare_bean.communication_bean import CommunicationBean from compare_backend.compare_bean.memory_compare_bean import MemoryCompareBean from compare_backend.compare_bean.memory_statistic_bean import MemoryStatisticBean @@ -15,6 +16,7 @@ from compare_backend.compare_bean.module_compare_bean import ModuleCompareBean from compare_backend.compare_bean.module_statistic_bean import ModuleStatisticBean from compare_backend.compare_bean.operator_compare_bean import OperatorCompareBean from compare_backend.compare_bean.operator_statistic_bean import OperatorStatisticBean +from compare_backend.compare_bean.overall_metrics_bean import OverallMetricsBean from compare_backend.data_prepare.module_data_prepare import ModuleDataPrepare from compare_backend.data_prepare.operator_data_prepare import OperatorDataPrepare from compare_backend.generator.base_generator import BaseGenerator @@ -41,8 +43,16 @@ class DetailPerformanceGenerator(BaseGenerator): self._args.enable_communication_compare: print("[INFO] Start to compare performance detail data, please wait.") comparator_list = self._create_comparator() - for comparator in comparator_list: - 
self._result_data.update(comparator.generate_data()) + else: + comparator_list = [] + if self._args.enable_profiling_compare: + overall_data = {Constant.BASE_DATA: self._profiling_data_dict.get(Constant.BASE_DATA).overall_metrics, + Constant.COMPARISON_DATA: self._profiling_data_dict.get( + Constant.COMPARISON_DATA).overall_metrics} + # overall 数据在最前面 + comparator_list.insert(0, OverallMetricsComparator(overall_data, OverallMetricsBean)) + for comparator in comparator_list: + self._result_data.update(comparator.generate_data()) def generate_view(self): if not self._result_data: @@ -57,6 +67,7 @@ class DetailPerformanceGenerator(BaseGenerator): comparator_list = [] op_compare_result = [] + if self._args.enable_operator_compare: module_compare_result = self.match_nn_module() if self._profiling_data_dict.get( Constant.BASE_DATA).python_function_data and self._profiling_data_dict.get( diff --git a/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py index 2127ff5e75..d4194fd880 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py @@ -2,6 +2,7 @@ from abc import abstractmethod, ABC from decimal import Decimal from compare_backend.compare_bean.origin_data_bean.compare_event import KernelEvent, MemoryEvent +from compare_backend.compare_bean.origin_data_bean.kernel_details_bean import KernelDetailsBean from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean from compare_backend.compare_bean.profiling_info import ProfilingInfo from compare_backend.utils.constant import Constant @@ -66,6 +67,76 @@ class BaseProfilingParser(ABC): self._comm_list = [] self._read_trace_event() self._cur_func_index = 0 + self._categorize_performance_index = 0 + + self._cpu_op = None + + @property + def cpu_op(self): + if 
self._cpu_op: + return self._cpu_op + cpu_op = [op for op in self._result_data.torch_op_data if op.is_cpu_op()] + cpu_op.sort(key=lambda x: x.start_time) + self._cpu_op = cpu_op + return self._cpu_op + + def _get_flow_time_dict(self): + return { + flow_event["end"].start_time: flow_event["start"].start_time + for flow_event in self._flow_dict.values() + if flow_event.get("end") and flow_event.get("start") + } + + def new_categorize_performance_data(self, tk: (TraceEventBean, KernelDetailsBean), flow_dict_new: dict): + flow_start_time = flow_dict_new.get(tk.start_time) + if not flow_start_time: + return + while self._categorize_performance_index < len(self.cpu_op): + cur_op = self.cpu_op[self._categorize_performance_index] + if cur_op.end_time < flow_start_time: + self._categorize_performance_index += 1 + continue + if cur_op.start_time < flow_start_time: + self._new_categorize_performance_data(cur_op, tk) + elif tk.is_vector_for_categorize_performance(): + if tk.is_trans(): + self._result_data.overall_metrics.update_vector_trans_info(tk.dur) + else: + self._result_data.overall_metrics.update_vector_notrans_info(tk.dur) + return + + def _new_categorize_performance_data(self, cpu_op: TraceEventBean, tk: (TraceEventBean, KernelDetailsBean)): + """ + 判断fa/conv/matmul/vector使用cpu_op + 判断反向还是正向用ek + """ + if cpu_op.is_fa_for_cpu_op(): + if cpu_op.is_bwd_for_cpu_op(): + if tk.is_cube_categorize_performance(): + self._result_data.overall_metrics.update_fa_bwd_cube_info(tk.dur) + else: + self._result_data.overall_metrics.update_fa_bwd_vector_info(tk.dur) + else: + if tk.is_cube_categorize_performance(): + self._result_data.overall_metrics.update_fa_fwd_cube_info(tk.dur) + else: + self._result_data.overall_metrics.update_fa_fwd_vector_info(tk.dur) + elif cpu_op.is_conv_for_cpu_op(): + if cpu_op.is_bwd_for_cpu_op(): + if tk.is_cube_categorize_performance(): + self._result_data.overall_metrics.update_conv_bwd_cube_info(tk.dur) + else: + 
self._result_data.overall_metrics.update_conv_bwd_vector_info(tk.dur) + else: + if tk.is_cube_categorize_performance(): + self._result_data.overall_metrics.update_conv_fwd_cube_info(tk.dur) + else: + self._result_data.overall_metrics.update_conv_fwd_vector_info(tk.dur) + elif cpu_op.is_matmul_for_cpu_op(): # matmul + if tk.is_cube_categorize_performance(): + self._result_data.overall_metrics.update_matmul_cube_info(tk.dur) + else: + self._result_data.overall_metrics.update_matmul_vector_info(tk.dur) @abstractmethod def _update_memory_list(self): diff --git a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py index c4089aec9b..4a613f6ef0 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py @@ -30,10 +30,7 @@ class GPUProfilingParser(BaseProfilingParser): @classmethod def __is_sdma_time(cls, name: str): - for mark in cls.SDMA_MARK_LIST: - if mark in name.lower(): - return True - return False + return any(mask in name.lower() for mask in cls.SDMA_MARK_LIST) def _update_memory_list(self): if not self._enable_memory_compare: @@ -68,19 +65,15 @@ class GPUProfilingParser(BaseProfilingParser): min_ts = sys.float_info.max max_ts = sys.float_info.min self._trace_events.sort(key=lambda x: x.start_time) - aten_events = list(filter(lambda x: x.name.startswith("aten::"), self._trace_events)) - flow_dict_new = {} - for flow_event in self._flow_dict.values(): - start_event = flow_event.get("start") - end_event = flow_event.get("end") - if start_event and end_event: - flow_dict_new[end_event.start_time] = start_event.start_time + aten_events = [event for event in self._trace_events if event.name.startswith("aten::")] + flow_dict_new = self._get_flow_time_dict() for event in self._trace_events: if event.stream: min_ts = min(event.start_time, min_ts) 
max_ts = max(event.end_time, max_ts) if event.stream == self._compute_stream_id and self.__is_sdma_time(event.name): self._result_data.overall_metrics.update_sdma_info(event.dur) + self._result_data.overall_metrics.update_sdma_stream_info(event.dur) continue if not event.is_kernel_cat(): continue @@ -88,6 +81,7 @@ class GPUProfilingParser(BaseProfilingParser): if event.is_nccl_name(): continue self.__add_compute_time(event, aten_events, flow_dict_new) + self.new_categorize_performance_data(event, flow_dict_new) self._aten_events = None self._result_data.overall_metrics.set_e2e_time(float(max_ts - min_ts)) self.__add_compute_and_overlap_time() diff --git a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py index 70ce44b44e..fca3e3c940 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py @@ -244,17 +244,7 @@ class NPUProfilingParser(BaseProfilingParser): self._result_data.overall_metrics.update_lccl_info(event.dur) def __parse_kernel_csv(self): - try: - kernel_details = FileReader.read_csv_file(self._kernel_detail_path, KernelDetailsBean) - except Exception: - print('[WARNING] Npu kernel details csv file is not available.') - return - if not kernel_details or kernel_details[0].is_hide_op_pmu(): - self._result_data.overall_metrics.hide_op_details = True - return - for kernel in kernel_details: - if kernel.is_invalid(): - continue + def __screen_data(kernel: KernelDetailsBean): if kernel.is_flash_attention(): if kernel.is_fa_bwd(): self._result_data.overall_metrics.update_fa_bwd_info(kernel.duration) @@ -276,6 +266,21 @@ class NPUProfilingParser(BaseProfilingParser): else: self._result_data.overall_metrics.update_cube_info(kernel.duration) + try: + kernel_details = FileReader.read_csv_file(self._kernel_detail_path, KernelDetailsBean) + 
except Exception: + print('[WARNING] Npu kernel details csv file is not available.') + return + if not kernel_details or kernel_details[0].is_hide_op_pmu(): + self._result_data.overall_metrics.hide_op_details = True + return + flow_dict_new = self._get_flow_time_dict() + for kernel in kernel_details: + if kernel.is_invalid(): + continue + __screen_data(kernel) + self.new_categorize_performance_data(kernel, flow_dict_new) + def __parse_mem_csv(self): try: memory_record = FileReader.read_csv_file(self._memory_record_path, MemoryRecordBean) @@ -321,3 +326,4 @@ class NPUProfilingParser(BaseProfilingParser): for stream in compute_stream: dur_list = sdma_dict.get(stream, []) self._result_data.overall_metrics.update_sdma_info(sum(dur_list), len(dur_list)) + self._result_data.overall_metrics.update_sdma_stream_info(sum(dur_list), len(dur_list)) diff --git a/profiler/compare_tools/compare_backend/utils/constant.py b/profiler/compare_tools/compare_backend/utils/constant.py index 1b77b214c8..c19a9e792e 100644 --- a/profiler/compare_tools/compare_backend/utils/constant.py +++ b/profiler/compare_tools/compare_backend/utils/constant.py @@ -11,6 +11,7 @@ class Constant(object): GREEN_COLOR = "00FF00" RED_COLOR = "FF0000" BLUE_COLOR = "00BFFF" + LIGHT_BLUE_COLOR = "87CEFA" US_TO_MS = 1000 KB_TO_MB = 1024 INVALID_VALUE = -1 @@ -55,6 +56,7 @@ class Constant(object): PERFORMANCE_TABLE = "Model Profiling Time Distribution" MODULE_TABLE = "ModuleCompare" MODULE_TOP_TABLE = "ModuleCompareStatistic" + OVERALL_METRICS_TABLE = "OverallMetrics" # memory SIZE = "Size(KB)" @@ -74,7 +76,14 @@ class Constant(object): MEMORY_LIST = "memory_list" COMMUNICATION_DICT = "comm_dict" - #compare type + # compare type OVERALL_COMPARE = "overall" BWD_LIST = ["bwd", "backward", "back"] + + CPU_OP_FA_MASK = ("flash", "attention", "fusion") + CPU_OP_CONV_MASK = ("aten::conv") + CPU_OP_MATMUL_MASK = ("aten::addmm", "aten::bmm", "aten::mm", "aten::matmul") + CPU_OP_TRANS_MASK = ("cast", "transdata", 
"transpose") + KERNEL_CUBE_MASK = ("gemm", "conv", "cutlass", "wgrad") + KERNEL_TRANS_MASK = ("cast", "transdata", "transpose") diff --git a/profiler/compare_tools/compare_backend/utils/excel_config.py b/profiler/compare_tools/compare_backend/utils/excel_config.py index 306abcdfec..9a3e5286ff 100644 --- a/profiler/compare_tools/compare_backend/utils/excel_config.py +++ b/profiler/compare_tools/compare_backend/utils/excel_config.py @@ -18,6 +18,8 @@ class CellFormatType: 'valign': 'vcenter', 'bold': True, 'border': True} # 绿色背景,加粗 YELLOW_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.YELLOW_COLOR, 'align': 'left', 'valign': 'vcenter', 'bold': True, 'border': True} # 黄色背景,加粗 + BLUE_NORMAL = {'fg_color': Constant.BLUE_COLOR} # 蓝色背景,主要用于行样式 + LIGHT_BLUE_NORMAL = {'fg_color': Constant.LIGHT_BLUE_COLOR} # 淡蓝色背景,主要用于行样式 class ExcelConfig(object): @@ -65,6 +67,10 @@ class ExcelConfig(object): MODULE_LEVEL = "Module Level" BASE_CALL_STACK = "Base Call Stack" COMPARISON_CALL_STACK = "Comparison Call Stack" + INDEX = "Index" + DURATION = "Duration(ms)" + DURATION_RATIO = "Duration Ratio" + DIFF_DUR_MS = "Diff Duration(ms)" HEADERS = { Constant.OPERATOR_TABLE: [ @@ -176,10 +182,34 @@ class ExcelConfig(object): {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 15}, {"name": BASE_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30}, {"name": COMPARISON_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30} + ], + Constant.OVERALL_METRICS_TABLE: [ + {"name": INDEX, "type": CellFormatType.DEFAULT, "width": 40}, + {"name": DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DURATION_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}, + {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DURATION_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}, + {"name": NUMBER, "type": CellFormatType.DEFAULT, 
"width": 10}, + {"name": DIFF_DUR_MS, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 10}, + ] } OVERHEAD = {Constant.OPERATOR_TABLE: ["B1:F1", "G1:K1"], Constant.MEMORY_TABLE: ["B1:F1", "G1:K1"], Constant.COMMUNICATION_TABLE: ["B1:H1", "I1:O1"], Constant.OPERATOR_TOP_TABLE: ["C1:D1", "E1:F1"], Constant.MEMORY_TOP_TABLE: ["C1:E1", "F1:H1"], Constant.MODULE_TOP_TABLE: ["F1:I1", "J1:M1"], - Constant.MODULE_TABLE: ["E1:H1", "I1:L1"]} + Constant.MODULE_TABLE: ["E1:H1", "I1:L1"], + Constant.OVERALL_METRICS_TABLE: ["B1:D1", "E1:G1"]} + + ROW_STYLE = { + Constant.OVERALL_METRICS_TABLE: [ + CellFormatType.BLUE_NORMAL, CellFormatType.LIGHT_BLUE_NORMAL, {}, {}, {}, {}, + CellFormatType.LIGHT_BLUE_NORMAL, {}, {}, {}, {}, CellFormatType.LIGHT_BLUE_NORMAL, {}, {}, + CellFormatType.LIGHT_BLUE_NORMAL, {}, {}, CellFormatType.LIGHT_BLUE_NORMAL, + CellFormatType.LIGHT_BLUE_NORMAL, + CellFormatType.BLUE_NORMAL, {}, {}, CellFormatType.BLUE_NORMAL, {}, {}, + CellFormatType.BLUE_NORMAL + ] + } diff --git a/profiler/compare_tools/compare_backend/view/work_sheet_creator.py b/profiler/compare_tools/compare_backend/view/work_sheet_creator.py index 7a33168da3..8f2f1baa7b 100644 --- a/profiler/compare_tools/compare_backend/view/work_sheet_creator.py +++ b/profiler/compare_tools/compare_backend/view/work_sheet_creator.py @@ -20,7 +20,10 @@ class WorkSheetCreator: return self._work_sheet = self._work_book.add_worksheet(self._sheet_name) self._write_headers() - self._write_data() + if "row_style" in self._data: + self._write_data_with_row_style() + else: + self._write_data() def _write_headers(self): base_header_format = self._work_book.add_format(CellFormatType.GREEN_BOLD) @@ -58,3 +61,26 @@ class WorkSheetCreator: cell_data = "INF" if cell_data == float('inf') else cell_data self._work_sheet.write(f"{self._col_ids[index]}{self._row_id}", cell_data, cell_format) self._row_id += 1 + + def _write_data_with_row_style(self): 
+ """ + 带行样式及缩进的sheet + Returns: + + """ + red_ratio_format = self._work_book.add_format(CellFormatType.RED_RATIO) + rows = self._data.get("rows") + row_style = self._data.get("row_style") # 行样式 + col_style = [header["type"] for header in self._data.get("headers")] # dict格式列样式 + + for data, row_style in zip(rows, row_style): + for index, cell_data in enumerate(data): + cell_format = {**col_style[index], **row_style} # 兼容python3.5 + if isinstance(cell_data, str): + cell_format["indent"] = cell_data.count("\t") # 缩进 + cell_format = self._work_book.add_format(cell_format) + if index == self._diff_ratio_index and cell_data and cell_data > 1: + cell_format = red_ratio_format + cell_data = "INF" if cell_data == float('inf') else cell_data + self._work_sheet.write(f"{self._col_ids[index]}{self._row_id}", cell_data, cell_format) + self._row_id += 1 -- Gitee