diff --git a/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py b/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py index 2b6e5270f278276521b20eae225b0c004a77a2f7..336bef7dd8553eb82586d52260443a7d01e84ab0 100644 --- a/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py +++ b/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py @@ -15,11 +15,13 @@ from common_func_advisor.constant import Constant from advice_factory.advice_factory import AdviceFactory from compute_advice.npu_fused_advice import NpuFusedAdvice +from compute_advice.npu_slow_advice import NpuSlowAdvice class ComputeAdviceFactory(AdviceFactory): ADVICE_LIB = { Constant.NPU_FUSED: NpuFusedAdvice, + Constant.NPU_SLOW: NpuSlowAdvice, } def __init__(self, collection_path: str): diff --git a/profiler/advisor/advisor_backend/common_func_advisor/constant.py b/profiler/advisor/advisor_backend/common_func_advisor/constant.py index 34879db9f2c078854aab6cfe658fc46865b885df..5b11dd33dd8d33f6a447c2f0c28bbfb9ebe5416e 100644 --- a/profiler/advisor/advisor_backend/common_func_advisor/constant.py +++ b/profiler/advisor/advisor_backend/common_func_advisor/constant.py @@ -15,7 +15,98 @@ from enum import Enum +class CsvTitle: + MODEL_NAME = "Model Name" + MODEL_ID = "Model ID" + TASK_ID = "Task ID" + STREAM_ID = "Stream ID" + INFER_ID = "Infer ID" + TASK_START_TIME = "Task Start Time(us)" + TASK_WAIT_TIME = "Task Wait Time(us)" + BLOCK_DIM = "Block Dim" + MIX_BLOCK_DIM = "Mix Block Dim" + HF32_ELIGIBLE = "HF32 Eligible" + INPUT_SHAPES = "Input Shapes" + INPUT_DATA_TYPES = "Input Data Types" + INPUT_FORMATS = "Input Formats" + OUTPUT_SHAPES = "Output Shapes" + OUTPUT_DATA_TYPES = "Output Data Types" + OUTPUT_FORMATS = "Output Formats" + CONTEXT_ID = "Context ID" + AICORE_TIME = "aicore_time(us)" + AIC_TOTAL_CYCLES = "aic_total_cycles" + AIC_MAC_TIME = "aic_mac_time(us)" + AIC_MAC_RATIO = "aic_mac_ratio" + AIC_SCALAR_TIME = 
"aic_scalar_time(us)" + AIC_SCALAR_RATIO = "aic_scalar_ratio" + AIC_MTE1_TIME = "aic_mte1_time(us)" + AIC_MTE1_RATIO = "aic_mte1_ratio" + AIC_MTE2_TIME = "aic_mte2_time(us)" + AIC_MTE2_RATIO = "aic_mte2_ratio" + AIC_FIXPIPE_TIME = "aic_fixpipe_time(us)" + AIC_FIXPIPE_RATIO = "aic_fixpipe_ratio" + AIC_ICACHE_MISS_RATE = "aic_icache_miss_rate" + AIV_TIME = "aiv_time(us)" + AIV_TOTAL_CYCLES = "aiv_total_cycles" + AIV_VEC_TIME = "aiv_vec_time(us)" + AIV_VEC_RATIO = "aiv_vec_ratio" + AIV_SCALAR_TIME = "aiv_scalar_time(us)" + AIV_SCALAR_RATIO = "aiv_scalar_ratio" + AIV_MTE2_TIME = "aiv_mte2_time(us)" + AIV_MTE2_RATIO = "aiv_mte2_ratio" + AIV_MTE3_TIME = "aiv_mte3_time(us)" + AIV_MTE3_RATIO = "aiv_mte3_ratio" + AIV_ICACHE_MISS_RATE = "aiv_icache_miss_rate" + CUBE_UTILIZATION = "cube_utilization( %)" + TASK_DURATION_SUM = "Task Duration Sum(us)" + TASK_DURATION_MEAN = "Task Duration Mean(us)" + TASK_DURATION_STD = "Task Duration Std(us)" + TASK_DURATION_RATIO = "Task Duration Ratio(100%)" + SIZE = "size(MB)" + THROUGHPUT = "throughput(GB/s)" + COLOR = "color" + GAP = "Gap(us)" + DURATION_SUM = "Duration Sum(us)" + COUNT = "Count" + MAX_DURATION = "Max Duration(us)" + MIN_DURATION = "Min Duration(us)" + AVG_DURATION = "Avg Duration(us)" + DURATION_RATIO = "Duration Ratio" + INDEX = "Index" + + +# 定义CSV_TITILE_V1类,继承自CSV_TITILE类, 适配旧版csv +class CsvTitleV1(CsvTitle): + OP_NAME = "Op Name" + OP_TYPE = "OP Type" + TASK_TYPE = "Task Type" + TASK_DURATION = "Task Duration(us)" + + +# 定义CSV_TITILE_V1类,继承自CSV_TITILE类, 适配新版csv +class CsvTitleV2(CsvTitle): + OP_NAME = "Name" + OP_TYPE = "Type" + TASK_TYPE = "Accelerator Core" + TASK_DURATION = "Duration(us)" + + class Constant: + DTYPE_SIZE_MAP = {"int8": 1, "uint8": 1, + "int16": 2, "uint16": 2, + "int32": 4, "uint32": 4, + "int64": 8, "uint64": 8, + "float16": 2, + "bfloat16": 2, + "bf16": 2, + "dt_bf16": 2, + "float32": 4, + "float": 4, + "float64": 8, + "complex64": 8, + "complex128": 16, + "bool": 1} + TP_THRESHOLD = 1150 
MAX_INPUT_MODE_LEN = 30 MAX_INPUT_ADVICE_LEN = 30 SMALL_OP_DUR_RATIO = 0.2 @@ -35,6 +126,7 @@ class Constant: # compute NPU_FUSED = "npu_fused" + NPU_SLOW = "npu_slow" # timeline OPTIM = "optimizer" @@ -108,3 +200,24 @@ class Constant: ("Cast", "Mul", "MaskedFill", "SoftmaxV2", "Cast"): "torch_npu.npu_scaled_masked_softmax", ("Mul", "Slice", "Neg", "Slice", "ConcatD", "Mul"): "torch_npu.npu_rotary_mul", ("Cast", "Square", "ReduceMeanD", "Add", "Rsqrt", "Mul", "Cast", "Mul"): "torch_npu.npu_rms_norm"} + TITLE = CsvTitleV2 + + @classmethod + def update_title(cls): + cls.TITLE = CsvTitleV1 + + +class CoreType: + AIV = "AI_VECTOR_CORE" + AIC = "AI_CORE" + AICPU = "AI_CPU" + MIX_AIV = "MIX_AIV" + MIX_AIC = "MIX_AIC" + HCCL = "HCCL" + + +class PerfColor(Enum): + WHITE = 0 + GREEN = 1 + YELLOW = 2 + RED = 3 diff --git a/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py b/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py index 08ef02876561001b9721e365c9aa6934057674de..0ad70880d42e738caf0fb0aeac8db6978b9099c5 100644 --- a/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py +++ b/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
def check_call_stack(self):
    """Report whether this profiling run recorded Python call stacks.

    Looks for a ``profiler_info*`` JSON file directly under ``self.path`` and
    inspects its ``config.common_config`` section.

    Returns:
        The configured ``with_stack`` value (truthy) when stack collection
        was enabled and ``"ProfilerActivity.CPU"`` is among the recorded
        ``activities``; ``False`` in every other case.
    """
    # ``self.path`` is whatever the caller handed to the constructor; when it
    # is a plain file (or does not exist) there is nothing to list, and
    # ``os.listdir`` would raise instead of answering "no call stack".
    if not os.path.isdir(self.path):
        return False
    profiler_info_json_path = ""
    for file_name in os.listdir(self.path):
        if file_name.startswith("profiler_info"):
            profiler_info_json_path = os.path.join(self.path, file_name)
            break
    if not profiler_info_json_path:
        return False
    if not os.path.exists(profiler_info_json_path) or not os.path.exists(self.trace_view_path):
        return False
    info = FileManager.read_json_file(profiler_info_json_path)
    common_config = (info.get("config") or {}).get("common_config") or {}
    with_stack = common_config.get("with_stack")
    if not with_stack:
        return False
    activities = common_config.get("activities")
    if not activities or "ProfilerActivity.CPU" not in activities:
        return False
    return with_stack


def get_call_stack(self, data: pd.DataFrame, index_id: int, ts_col: str) -> str:
    """Return the Python call stack that launched the op at ``data.loc[index_id]``.

    Args:
        data: Frame holding one op (or fused pattern) per row.
        index_id: Index label of the row to look up.
        ts_col: Name of the column holding the op's start timestamp, which is
            used as the key of the torch-to-NPU flow event.

    Returns:
        The stack frames joined with newlines, or ``""`` when the stack
        cannot be resolved (a diagnostic is printed in that case).
    """
    if not self.check_call_stack():
        print("There is no call stack info, please set 'with_stack=True'")
        return ""
    if ts_col not in data.columns.tolist():
        print("[ERROR] No {} col found in data columns.".format(ts_col))
        return ""
    if index_id not in data.index:
        print("[ERROR] No row with index {} found in data.".format(index_id))
        return ""
    row = data.loc[index_id]
    # Only fused-pattern frames carry a 'pattern' column; fall back to the row
    # index so the error paths below cannot raise KeyError for other frames.
    if "pattern" in row.index:
        target = "pattern {}".format(row["pattern"])
    else:
        target = "index {}".format(index_id)
    flow_event = self.get_torch_2_npu_flow_event(row[ts_col])
    if not flow_event.valid():
        print("[ERROR] Get flow event failed for {}.".format(target))
        return ""
    python_dur_events = self.get_python_dur_events_contain_ts(flow_event.s_point_ts)
    if not python_dur_events:
        print("[ERROR] No python dur event found for {}.".format(target))
        return ""
    # Stay compatible with both the old and the new call-stack layout.
    legacy_stack = python_dur_events[0].args.get("Call stack")
    if legacy_stack:
        # Old layout: the whole stack is one ';'-separated event argument.
        frames = legacy_stack.split(";")
    else:
        # New layout: one "python_function" event per frame, ordered by start.
        python_dur_events.sort(key=lambda e: e.ts)
        frames = [event.name for event in python_dur_events if event.cat == "python_function"]
    return "\n".join(frames)
class OpPerfFactory:
    """Pick the ``OpPerf`` flavour that matches an op's accelerator core."""

    @classmethod
    def build(cls, op_row: Dict):
        """Wrap one kernel-details row in the matching analyzer.

        Args:
            op_row: One CSV row, addressable by column title.

        Returns:
            ``VecOpPerf`` for AIV / MIX_AIV rows, ``CubeOpPerf`` for
            AIC / MIX_AIC rows, plain ``OpPerf`` otherwise.
        """
        # The legacy-CSV switch must happen BEFORE the task-type column is
        # read. OpPerf.__init__ performs the same switch, but only after this
        # dispatch has run, so the first legacy row seen by each process was
        # looked up under the new column title and never classified.
        if "OP Type" in op_row.keys():
            Constant.update_title()
        task_type = op_row.get(Constant.TITLE.TASK_TYPE)
        if task_type in (CoreType.AIV, CoreType.MIX_AIV):
            return VecOpPerf(op_row)
        if task_type in (CoreType.AIC, CoreType.MIX_AIC):
            return CubeOpPerf(op_row)
        return OpPerf(op_row)
@staticmethod
def get_dtype_size(dtype_str: str):
    """Return the byte width of ``dtype_str`` (case-insensitive), 0 if unknown."""
    return Constant.DTYPE_SIZE_MAP.get(dtype_str.lower(), 0)


@staticmethod
def get_element_count(shape: list):
    """Return the product of all dimensions in ``shape``.

    Dimensions are converted with ``int``. An empty shape yields 1 (the empty
    product) instead of raising ``TypeError``.
    """
    return functools.reduce(lambda total, dim: total * int(dim), shape, 1)


@staticmethod
def shape_to_tuple(shape_str: str) -> tuple:
    """Parse a shapes cell such as ``"4,1024;1024"`` into ``((4, 1024), (1024,))``.

    Tensors are separated by ``;`` and dimensions by ``,``. An empty dimension
    is read as 0. Anything that is not a non-empty string yields ``()``.
    """
    if not isinstance(shape_str, str):
        return ()
    trimmed = shape_str.strip('"').strip(';')
    if not trimmed:
        return ()
    return tuple(
        tuple(int(dim) if dim != "" else 0 for dim in tensor_shape.split(','))
        for tensor_shape in trimmed.split(';')
    )


@staticmethod
def dtype_to_tuple(dtypes_str: str) -> tuple:
    """Split a ``;``-separated data-types cell into a tuple of dtype names.

    Anything that is not a non-empty string yields ``()``.
    """
    if not isinstance(dtypes_str, str):
        return ()
    trimmed = dtypes_str.strip('"').strip(';')
    if not trimmed:
        return ()
    return tuple(trimmed.split(';'))


def get_mac_ratio(self):
    """Return the row's ``aic_mac_ratio`` value as stored on the instance."""
    return self.aic_mac_ratio


def get_size(self, shapes_str, dtypes_str):
    """Return the total byte size of the tensors described by the two cells.

    Args:
        shapes_str: ``;``-separated tensor shapes.
        dtypes_str: ``;``-separated dtype names, matched to shapes by position.

    Returns:
        Sum over tensors of element count times dtype width, or 0 when more
        shapes than dtypes are present (an error is printed).
    """
    shapes = self.shape_to_tuple(shapes_str)
    dtypes = self.dtype_to_tuple(dtypes_str)
    if len(shapes) > len(dtypes):
        print(f"[ERROR] The size of shape is greater than that of dtypes in {self.row.get(Constant.TITLE.OP_NAME)}.")
        return 0
    # A dtype without a shape is counted as a single element.
    padded_shapes = list(shapes) + [(1,)] * (len(dtypes) - len(shapes))
    return sum(
        self.get_element_count(shape) * self.get_dtype_size(dtype)
        for shape, dtype in zip(padded_shapes, dtypes)
    )
def get_calc_size(self):
    """Return the op's combined input and output tensor size in MB.

    Returns 0 (after printing an error) when either shapes cell is empty.
    """
    if not self.input_shapes or not self.output_shapes:
        print("[ERROR] There is no tensor data, do not assess vector op performance.")
        return 0
    input_size = self.get_size(self.input_shapes, self.input_data_types)
    output_size = self.get_size(self.output_shapes, self.output_data_types)
    return (input_size + output_size) / 1e6


def get_throughput(self):
    """Return the op's throughput in GB/s.

    Uses the size (MB) already stored in the row by ``update`` and the task
    duration (us). Returns 0 when the duration is missing or effectively zero.
    """
    if not self.task_duration or abs(self.task_duration) < 1e-6:
        print("[ERROR] There is no task_duration, do not assess vector op performance.")
        return 0
    # MB / us == 1e3 GB/s
    return self.row[Constant.TITLE.SIZE] / self.task_duration * 1e3


def get_perf_color(self):
    """Return the rating for this op; the base class never rates (WHITE)."""
    return PerfColor.WHITE


def update(self):
    """Add the size, throughput and colour columns to the row and return it.

    Analysis errors are reported and the row is returned with whatever
    columns were filled in before the failure (best effort).
    """
    try:
        self.row[Constant.TITLE.SIZE] = self.get_calc_size()
        self.row[Constant.TITLE.THROUGHPUT] = self.get_throughput()
        self.row[Constant.TITLE.COLOR] = self.get_perf_color().name
    except Exception as e:
        print(f"[ERROR] Analysis failed: {self.row.get(Constant.TITLE.OP_NAME)}, ", e, flush=True)
    # Returning after the try statement rather than inside ``finally`` lets
    # KeyboardInterrupt / SystemExit propagate instead of being discarded.
    return self.row


class VecOpPerf(OpPerf):
    """Rates vector-core ops by achieved throughput."""

    def get_perf_color(self) -> PerfColor:
        """Rate the op against ``Constant.TP_THRESHOLD``.

        WHITE when throughput is 0 (not assessed), RED below half the
        threshold for ops longer than 20 us, YELLOW between half and the full
        threshold, GREEN otherwise.
        """
        throughput = self.row[Constant.TITLE.THROUGHPUT]
        op_duration = self.task_duration
        tp_threshold = Constant.TP_THRESHOLD
        half_threshold = tp_threshold / 2
        if throughput == 0:
            return PerfColor.WHITE
        if throughput < half_threshold and op_duration > 20:
            return PerfColor.RED
        if half_threshold <= throughput < tp_threshold:
            return PerfColor.YELLOW
        # NOTE(review): ops below half the threshold that last <= 20 us also
        # end up here and are rated GREEN - confirm this is intended.
        return PerfColor.GREEN


class CubeOpPerf(OpPerf):
    """Rates cube-core ops by their MAC utilisation ratio."""

    def get_perf_color(self) -> PerfColor:
        """Rate the op by ``aic_mac_ratio``.

        WHITE when the ratio is missing, RED below 0.6, YELLOW below 0.8,
        GREEN otherwise.
        """
        aic_mac_ratio = self.get_mac_ratio()
        # NaN is truthy and fails every comparison, so without the
        # self-inequality check a missing (NaN) ratio would be rated GREEN.
        if not aic_mac_ratio or aic_mac_ratio != aic_mac_ratio:
            print("[WARN] There is no aic_mac_ratio, do not assess cube op performance.")
            return PerfColor.WHITE
        if aic_mac_ratio < 0.6:
            return PerfColor.RED
        if aic_mac_ratio < 0.8:
            return PerfColor.YELLOW
        return PerfColor.GREEN
class NpuSlowAdvice(ComputeAdviceBase, ABC):
    """Rates every op of a kernel-details CSV and exposes the slow ones."""

    # Name of the worksheet written by save_to_excel.
    OP_PERF_SHEET = "op_perf"

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.kernel_details_path = ""
        self.data = None

    @staticmethod
    def save_to_excel(data: pd.DataFrame, file_path: str) -> None:
        """Write ``data`` to ``file_path`` with colour-coded rows and a frozen header.

        Args:
            data: Frame produced by ``run``; must contain the colour column.
            file_path: Destination ``.xlsx`` path (overwritten).
        """
        sheet_name = NpuSlowAdvice.OP_PERF_SHEET
        # The context manager closes the workbook even if writing fails.
        with pd.ExcelWriter(file_path, engine="xlsxwriter", mode="w") as writer:
            # index_label names the index column in the sheet without
            # modifying the caller's frame.
            data.to_excel(writer, index=True, index_label=Constant.TITLE.INDEX, sheet_name=sheet_name)
            worksheet = writer.sheets[sheet_name]
            NpuSlowAdvice.color_sheet(data, writer.book, worksheet)
            # With the xlsxwriter engine freeze_panes is a method; assigning
            # "A2" to it (openpyxl style) silently freezes nothing.
            worksheet.freeze_panes(1, 0)

    @staticmethod
    def color_sheet(data: pd.DataFrame, workbook, worksheet):
        """Fill each data row of ``worksheet`` with the colour named in its colour column.

        Rows whose colour has no fill format (e.g. WHITE) are left untouched.
        """
        color_rgb = {
            PerfColor.GREEN.name: workbook.add_format({'bg_color': '#C6EFCE'}),
            PerfColor.YELLOW.name: workbook.add_format({'bg_color': '#FFEB9C'}),
            PerfColor.RED.name: workbook.add_format({'bg_color': '#FFC7CE'}),
        }
        # Sheet row 0 is the header, so data rows start at 1. Positions are
        # counted rather than taken from the index labels so that a filtered
        # or re-indexed frame still colours the right rows.
        for sheet_row, (_, row) in enumerate(data.iterrows(), start=1):
            fill_format = color_rgb.get(row[Constant.TITLE.COLOR])
            if not fill_format:
                continue
            worksheet.set_row(sheet_row, None, fill_format)

    @staticmethod
    def extract_slow_op(data: pd.DataFrame):
        """Return only the rows rated RED."""
        return data[data[Constant.TITLE.COLOR] == PerfColor.RED.name]

    @staticmethod
    def update_op_row(row: tuple):
        """Analyse one ``(index, row)`` pair from ``iterrows`` and return the updated row."""
        return OpPerfFactory.build(row[1]).update()

    def run(self):
        """Analyse the collection and return the rated frame (``None`` if the path check fails)."""
        if not self.path_check():
            return self.data
        self.process()
        return self.data

    def process(self):
        """Load the kernel-details CSV and rate every op in parallel."""
        self.data = pd.read_csv(self.kernel_details_path, dtype={"Start Time(us)": str})
        # Drop the trailing \t separator. rstrip only removes a tab that is
        # actually there, so timestamps without it are not truncated.
        self.data["Start Time(us)"] = self.data["Start Time(us)"].str.rstrip("\t")
        # The context manager tears the pool down even when a worker raises.
        with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
            result = pool.map(self.update_op_row, self.data.iterrows())
        self.data = pd.DataFrame(result)
"name": "python3", + "display_name": "Python 3 (ipykernel)", "language": "python", - "display_name": "Python 3 (ipykernel)" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -101,7 +121,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.18" + "version": "3.11.5" } }, "nbformat": 4,