diff --git a/profiler/msprof_analyze/advisor/analyzer/computation/ai_core_performance/__init__.py b/profiler/msprof_analyze/advisor/analyzer/computation/ai_core_performance/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/msprof_analyze/advisor/analyzer/computation/ai_core_performance/ai_core_performance_analyzer.py b/profiler/msprof_analyze/advisor/analyzer/computation/ai_core_performance/ai_core_performance_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..a648fb074e9c443865e07cf44380abede307f317 --- /dev/null +++ b/profiler/msprof_analyze/advisor/analyzer/computation/ai_core_performance/ai_core_performance_analyzer.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import logging

from profiler.msprof_analyze.advisor.analyzer.base_analyzer import BaseAnalyzer
from profiler.msprof_analyze.advisor.analyzer.computation.ai_core_performance.ai_core_performance_checker import \
    AICorePerformanceChecker
from profiler.msprof_analyze.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
from profiler.msprof_analyze.advisor.result.result import OptimizeResult
from profiler.msprof_analyze.advisor.display.html.priority_background_color import PriorityBackgroundColor
from profiler.msprof_analyze.advisor.display.html.render import HTMLRender

logger = logging.getLogger()


class AICorePerformanceAnalyzer(BaseAnalyzer):
    """
    Analyzer that drives AICorePerformanceChecker over a profiling dataset.

    It filters the candidate operators, runs the cube / FA / vector checks,
    stores the findings in an OptimizeResult and renders the HTML section.
    """
    dataset_cls_list = [ProfilingDataset]

    def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None:
        super().__init__(collection_path, n_processes, **kwargs)
        profiling_key = ProfilingDataset.get_key()
        self.profiling_dataset = self.get_first_data_by_key(self.dataset_list, profiling_key)
        self.result = OptimizeResult()
        self.html_render = HTMLRender()
        # Rendered HTML of the last optimize() call; stays None when nothing was rendered.
        self.html = None

    def optimize(self, **kwargs):
        """
        Run the AI core performance checks and return the optimize result.

        Recognised keyword arguments are "add_render_list" (default True) and
        "rank"; both are forwarded to the checker's make_render().
        """
        add_render_list = kwargs.get("add_render_list", True)
        checker = AICorePerformanceChecker()

        checker.data_filter(self.profiling_dataset)
        if not checker.ai_core_performance_issues:
            return self.result

        checker.check_ai_core_performance(self.profiling_dataset)
        # The checks may clear the flag again (nothing to report); stop here so
        # that self.html is not overwritten with the False returned by make_render().
        if not checker.ai_core_performance_issues:
            return self.result

        checker.make_record(self.result)
        self.html = checker.make_render(self.html_render,
                                        add_render_list,
                                        priority=self.get_priority(),
                                        rank=kwargs.get("rank"))
        return self.result

    def get_priority(self, max_mem_op_dur=None):
        """Return the display priority of this analyzer; it is always low."""
        return PriorityBackgroundColor.low
# Copyright (c) 2024, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0  (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from functools import reduce

from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
from profiler.advisor.result.result import OptimizeResult
from profiler.prof_common.additional_args_manager import AdditionalArgsManager
from profiler.prof_common.file_manager import FileManager

logger = logging.getLogger()


class AICorePerformanceChecker:
    """
    Operator performance checker for cube (matmul), FlashAttention and vector operators.

    Typical call order: data_filter() -> check_ai_core_performance() ->
    make_record() / make_render().  self.result maps "cube", "fa" and "vector"
    to a list of queues: [optimization_queue, bound_queue, affinity_queue]
    (the vector entry has no affinity queue).
    """
    _CHECKER = "AICorePerformanceChecker"
    CUBE_OPERATOR_MEMORY_SIZE_MB = 100
    INNER_AXIS_256 = 256
    INNER_AXIS_128 = 128
    # Number of entries kept per queue after sorting.
    _TOP_N = 5
    # Suffix appended to the shape key of FlashAttentionScoreGrad operators.
    _FA_GRAD_SUFFIX = "-grad"
    # Errors that abort a single operator-kind check without aborting the whole analysis.
    _CHECK_ERRORS = (IndexError, ValueError, AttributeError, KeyError, TypeError, ZeroDivisionError)

    def __init__(self):
        self.result = dict()
        self.ai_core_performance_issues = False
        self.desc = ""
        self.cube_dict = {}
        self.fa_dict = {}
        self.fa_list = []
        self.vector_dict = {}
        self.load_aicore_perf_rules()

    @staticmethod
    def memory_size(operator):
        """
        Estimate the size of an operator's input and output tensors.

        Element counts of all tensors are summed, multiplied by 2 and divided by
        1024 * 1024 (presumably 2 bytes per element, result in MB -- confirm).
        Empty shape entries are skipped instead of raising ValueError.
        """
        memory = 0
        for shape in operator.input_shapes[1:-1].split(";"):
            if not shape:
                continue
            if "," not in shape:
                # The extra one-dimensional input is the bias; it is counted twice up front.
                memory += int(shape) * 2
                continue
            memory += reduce(lambda x, y: x * y, map(int, shape.split(",")))
        for shape in operator.output_shapes[1:-1].split(";"):
            if not shape:
                continue
            memory += reduce(lambda x, y: x * y, map(int, shape.split(",")))

        return memory * 2 / 1024 / 1024

    @staticmethod
    def _shape_key(operator):
        """Build the "<input shapes>-<output shapes>" key; [1:-1] drops the first and last character."""
        return operator.input_shapes[1:-1] + "-" + operator.output_shapes[1:-1]

    @staticmethod
    def _group_by_name_and_shape(operators, key_func):
        """Index operators by (op_name, key_func(operator)), keeping their original order."""
        groups = {}
        for operator in operators:
            groups.setdefault((operator.op_name, key_func(operator)), []).append(operator)
        return groups

    @staticmethod
    def _classify_pair(first_value, first_rule, second_value, second_rule):
        """
        Classify an operator from two averaged ratios and their rules.

        Returns (bound, optimization): bound is the rule's bound name (or the
        combined "<first>_and_<second>_bound") when a threshold is reached,
        otherwise "" together with the largest gap to a threshold.
        """
        first_hit = first_value >= first_rule["threshold"]
        second_hit = second_value >= second_rule["threshold"]
        if first_hit and second_hit:
            return first_rule["bound"] + "_and_" + second_rule["bound"] + "_bound", 0.
        if first_hit:
            return first_rule["bound"], 0.
        if second_hit:
            return second_rule["bound"], 0.
        return "", max(first_rule["threshold"] - first_value,
                       second_rule["threshold"] - second_value)

    @staticmethod
    def _get_operator_list(cube_dict, profiling_dataset):
        """Return the operators of the dataset whose name and shape key are listed in cube_dict."""
        operator_list = []
        for op in profiling_dataset.op_summary.op_list:
            if op.op_name in cube_dict:
                key = op.input_shapes[1:-1] + "-" + op.output_shapes[1:-1]
                if key in cube_dict[op.op_name]:
                    operator_list.append(op)
        return operator_list

    @staticmethod
    def _get_vector_list(profiling_dataset, vector_dict):
        """Return the operators of the dataset whose name and shape key are listed in vector_dict."""
        # Same selection as for cube operators; a single pass replaces the nested rescans.
        return AICorePerformanceChecker._get_operator_list(vector_dict, profiling_dataset)

    @staticmethod
    def safe_divide(numerator, denominator):
        """Return numerator / denominator, or None (with a warning) when the denominator is 0."""
        if denominator == 0:
            logger.warning("Warning: Division by zero is not allowed.")
            return None
        return numerator / denominator

    def load_aicore_perf_rules(self):
        """
        Load advisor/rules/<language>/aicore_performance.yaml into attributes.

        When the rule file is missing, an empty rule set is used and data_filter()
        skips the analysis instead of reading a non-existent file.
        """
        language = AdditionalArgsManager().language
        # Walk four directory levels up from this file to reach the advisor directory.
        advisor_dir = os.path.realpath(__file__)
        for _ in range(4):
            advisor_dir = os.path.dirname(advisor_dir)
        rule_path = os.path.join(advisor_dir, "rules", language, "aicore_performance.yaml")

        self.language = language
        if os.path.exists(rule_path):
            rules = FileManager.read_yaml_file(rule_path) or {}
        else:
            logger.warning("Skip analyze ai core performance issues, because %s does not exist.", rule_path)
            rules = {}

        self.aicore_rules = rules
        self._CUBE_PROBLEM = rules.get("cube_problem")
        self._FA_PROBLEM = rules.get("fa_problem")
        self._VECTOR_PROBLEM = rules.get("vector_problem")
        self.desc = rules.get("description")
        self._BOUND_DESC = rules.get("bound_description")
        self._OPTI_DESC = rules.get("optimization_description")
        self._AFFINITY_DESC = rules.get("affinity_description")
        self._CUBE_AFFINITY_DESC = rules.get("cube_affinity_desc")
        self._FA_AFFINITY_DESC_TYPE1 = rules.get("fa_affinity_desc_type1")
        self._FA_AFFINITY_DESC_TYPE2 = rules.get("fa_affinity_desc_type2")
        self._FA_AFFINITY_DESC_TYPE3 = rules.get("fa_affinity_desc_type3")
        self.suggestion = rules.get("suggestion")
        self._AFFINITY_SUGGESTION = rules.get("affinity_suggestion")
        self._BOUND_SUGGESTION = rules.get("bound_suggestion")
        self._OPTI_SUGGESTION = rules.get("optimization_suggestion")
        self._OPERATOR_RULES = {"cube_operators": rules.get("cube_operators"),
                                "fa_operators": rules.get("fa_operators"),
                                "vector_operators": rules.get("vector_operators")}

    def data_filter(self, profiling_dataset: ProfilingDataset):
        """
        Select the operators worth analysing and fill cube_dict, fa_dict, fa_list and vector_dict.

        Sets ai_core_performance_issues to True when at least one candidate was found.
        """
        if not self.aicore_rules:
            # Without rules neither thresholds nor suggestion texts are available.
            return
        if not self.check_task_list(profiling_dataset):
            return

        operator_list = profiling_dataset.op_summary.op_list
        total_duration = sum(float(operator.task_duration) for operator in operator_list)
        cube_memory_dict = {}
        vector_type_dict = {}

        for op in operator_list:
            shapes = self._shape_key(op)
            # preliminary filter cube operator
            if op.task_type == "AI_CORE" and "matmul" in op.op_type.lower():
                try:
                    memory = self.memory_size(op)
                except ValueError:
                    # A shape that cannot be parsed must not abort the whole analysis.
                    logger.warning("Skip operator %s because its shapes cannot be parsed.", op.op_name)
                    continue
                shape_memory = cube_memory_dict.setdefault(op.op_name, {})
                shape_memory[shapes] = shape_memory.get(shapes, 0) + memory
                continue

            # preliminary filter vector operator
            if op.task_type in ["AI_VECTOR_CORE", "MIX_AIV"]:
                vector_type_dict.setdefault(op.op_type, set()).add(op)
                continue

            # filter fa operator
            if op.op_type == "FlashAttentionScore":
                self.fa_dict.setdefault(op.op_name, set()).add(shapes)
                self.fa_list.append(op)
            elif op.op_type == "FlashAttentionScoreGrad":
                self.fa_dict.setdefault(op.op_name, set()).add(shapes + self._FA_GRAD_SUFFIX)
                self.fa_list.append(op)

        # filter cube operator: keep shapes whose accumulated memory reaches the limit
        for op_name, shape_memory in cube_memory_dict.items():
            for shapes, memory in shape_memory.items():
                if memory >= self.CUBE_OPERATOR_MEMORY_SIZE_MB:
                    self.cube_dict.setdefault(op_name, set()).add(shapes)

        # filter vector operator: keep op types taking >= 1% of the total time or >= 1000000 in total
        for vector_ops in vector_type_dict.values():
            type_duration = sum(float(op.task_duration) for op in vector_ops)
            # Guard against a dataset whose total duration is zero.
            duration_ratio = type_duration / total_duration if total_duration else 0.
            if duration_ratio >= 0.01 or type_duration >= 1000000:
                for op in vector_ops:
                    self.vector_dict.setdefault(op.op_name, set()).add(self._shape_key(op))

        if any([self.cube_dict, self.fa_dict, self.vector_dict]):
            self.ai_core_performance_issues = True

    def check_ai_core_performance(self, promoting_dataset: ProfilingDataset):
        """
        Run the cube, FA and vector checks and store their queues in self.result.

        :Param promoting_dataset: dataset of operator performance from kernel_details.csv
        """
        checks = (("cube", self.check_cube_operator),
                  ("fa", self.check_fa_operator),
                  ("vector", self.check_vector_operator))
        for op_kind, check in checks:
            try:
                self.result[op_kind] = check(promoting_dataset)
            except self._CHECK_ERRORS as e:
                logger.error("Failed to check ai core performance %s operator, %s.", op_kind, e)
                self.result[op_kind] = []

        # Each entry is a list of queues, so look inside the queues: [[], [], []] is truthy.
        if not any(any(self.result[op_kind]) for op_kind, _ in checks):
            self.ai_core_performance_issues = False

    def check_cube_operator(self, profiling_dataset: ProfilingDataset):
        """Return [optimization_queue, bound_queue, affinity_queue] for the filtered cube operators."""
        cube_dict = self.cube_dict
        suggestion = self._CUBE_AFFINITY_DESC
        optimization_queue, bound_queue, affinity_queue = [], [], []
        operator_list = self._get_operator_list(cube_dict, profiling_dataset)
        groups = self._group_by_name_and_shape(operator_list, self._shape_key)
        mac_rule, mte2_rule = self._select_rules("cube_operators", "aic_mac_ratio", "aic_mte2_ratio")

        for op, shapes in cube_dict.items():
            for shape in shapes:
                matched = groups.get((op, shape), [])
                shape_duration = sum(float(operator.task_duration) for operator in matched)
                display_shape = shape.split("-")[0]

                if not self._check_cube_inner_axis(shape):
                    # Inner axes are not aligned: report the shape as not affine.
                    affinity_queue.append({
                        "op_name": op,
                        "shape": display_shape,
                        "dtype": matched[-1].input_data_types if matched else None,
                        "duration": shape_duration,
                        "suggestion": suggestion})
                    continue

                dtype = matched[0].input_data_types if matched else None
                means, _, _ = self._aggregate(matched, ("aic_mac_ratio", "aic_mte2_ratio"))
                if means is None:
                    continue
                # The mac ratio is compared with the mac rule's own threshold.
                bound, optimization = self._classify_pair(means[0], mac_rule, means[1], mte2_rule)
                if bound:
                    bound_queue.append({
                        "op_name": op,
                        "shape": display_shape,
                        "dtype": dtype,
                        "bound": bound,
                        "duration": shape_duration})
                else:
                    optimization_queue.append({
                        "op_name": op,
                        "shape": display_shape,
                        "dtype": dtype,
                        "optimization": round(optimization * 100, 2)})
        return [self._top(optimization_queue, "optimization"),
                self._top(bound_queue, "duration"),
                self._top(affinity_queue, "duration")]

    def check_fa_operator(self, profiling_dataset: ProfilingDataset):
        """Return [optimization_queue, bound_queue, affinity_queue] for the FlashAttention operators."""
        fa_list = self.fa_list
        optimization_queue, bound_queue, affinity_queue = [], [], []
        groups = self._group_by_name_and_shape(fa_list, self._fa_shape_key)
        fixpipe_rule, vec_rule, mte2_rule = self._select_rules(
            "fa_operators", "aic_fixpipe_ratio", "aiv_vec_ratio", "aic_mte2_ratio")

        for op, shapes in self.fa_dict.items():
            for shape in shapes:
                not_affine, dtype, shape_duration, suggestion = self._check_fa_inner_axis(fa_list, op, shape)
                display_shape = shape.split("-")[0]
                if not_affine:
                    # Not-affine operator: its duration was summed by _check_fa_inner_axis.
                    affinity_queue.append({
                        "op_name": op,
                        "shape": display_shape,
                        "dtype": dtype,
                        "suggestion": suggestion,
                        "duration": shape_duration})
                    continue

                # Bound / optimization handling; backward and forward use different ratios.
                matched = groups.get((op, shape), [])
                if shape.endswith(self._FA_GRAD_SUFFIX):
                    ratio_names = ("aic_fixpipe_ratio", "aic_mte2_ratio")
                    rules = (fixpipe_rule, mte2_rule)
                else:
                    ratio_names = ("aic_mte2_ratio", "aiv_vec_ratio")
                    rules = (mte2_rule, vec_rule)
                means, shape_duration, dtype = self._aggregate(matched, ratio_names)
                if means is None:
                    continue
                bound, optimization = self._classify_pair(means[0], rules[0], means[1], rules[1])
                if bound:
                    bound_queue.append({
                        "op_name": op,
                        "shape": display_shape,
                        "dtype": dtype,
                        "bound": bound,
                        "duration": shape_duration})
                else:
                    optimization_queue.append({
                        "op_name": op,
                        "shape": display_shape,
                        "dtype": dtype,
                        "optimization": round(optimization * 100, 2)})

        return [self._top(optimization_queue, "optimization"),
                self._top(bound_queue, "duration"),
                self._top(affinity_queue, "duration")]

    def check_vector_operator(self, profiling_dataset: ProfilingDataset):
        """Return [optimization_queue, bound_queue] for the filtered vector operators."""
        vector_dict = self.vector_dict
        optimization_queue, bound_queue = [], []
        vector_list = self._get_vector_list(profiling_dataset, vector_dict)
        groups = self._group_by_name_and_shape(vector_list, self._shape_key)
        vec_rule, mte2_rule, mte3_rule, total_rule = self._select_rules(
            "vector_operators", "aiv_vec_ratio", "aiv_mte2_ratio", "aiv_mte3_ratio", "total")
        ratio_names = ("aiv_vec_ratio", "aiv_mte2_ratio", "aiv_mte3_ratio")

        for op_name, shapes in vector_dict.items():
            for shape in shapes:
                means, shape_duration, dtype = self._aggregate(groups.get((op_name, shape), []), ratio_names)
                if means is None:
                    continue
                aiv_vec_ratio, aiv_mte2_ratio, aiv_mte3_ratio = means
                bound, optimization = "", 0.
                if aiv_vec_ratio + aiv_mte2_ratio + aiv_mte3_ratio >= total_rule["threshold"]:
                    bound = total_rule["bound"]
                elif aiv_mte2_ratio >= mte2_rule["threshold"]:
                    bound = mte2_rule["bound"]
                elif aiv_mte3_ratio >= mte3_rule["threshold"]:
                    bound = mte3_rule["bound"]
                elif aiv_vec_ratio >= vec_rule["threshold"]:
                    bound = vec_rule["bound"]
                else:
                    optimization = max(vec_rule["threshold"] - aiv_vec_ratio,
                                       mte2_rule["threshold"] - aiv_mte2_ratio,
                                       mte3_rule["threshold"] - aiv_mte3_ratio)
                display_shape = shape.split("-")[0]
                if bound:
                    bound_queue.append({
                        "op_name": op_name,
                        "shape": display_shape,
                        "bound": bound,
                        "dtype": dtype,
                        "duration": shape_duration})
                else:
                    optimization_queue.append({
                        "op_name": op_name,
                        "shape": display_shape,
                        "dtype": dtype,
                        "optimization": round(optimization * 100, 2)})
        return [self._top(optimization_queue, "optimization"),
                self._top(bound_queue, "duration")]

    def draw_record(self, op_type: str, result: OptimizeResult):
        """Add the problem record and the detail rows of one operator kind ("cube", "fa", "vector")."""
        problem_map = {
            'cube': self._CUBE_PROBLEM,
            'fa': self._FA_PROBLEM,
            'vector': self._VECTOR_PROBLEM
        }
        problem = problem_map[op_type]
        result.add(OptimizeRecord(OptimizeItem(problem, self.desc, [self.suggestion])))
        headers = [
            "Type",
            "Description and Suggestion",
        ]
        result.add_detail(problem, headers=headers)

        queues = self.result[op_type]
        sections = [(self._OPTI_DESC, self._OPTI_SUGGESTION, queues[0]),
                    (self._BOUND_DESC, self._BOUND_SUGGESTION, queues[1])]
        if op_type != "vector":  # the vector kind has no affinity suggestions
            sections.append((self._AFFINITY_DESC, self._AFFINITY_SUGGESTION, queues[2]))
        for title, template, issues in sections:
            text = "".join(template.format(**issue) for issue in issues)
            if text:
                result.add_detail(problem, detail=[title, text])

    def make_record(self, result: OptimizeResult):
        """
        make record for what and how to optimize
        """
        if not self.ai_core_performance_issues:
            return self.ai_core_performance_issues

        for op_type in ("cube", "fa", "vector"):
            if any(self.result.get(op_type, [])):
                self.draw_record(op_type, result)

        return True

    def make_render(self, html_render, add_render_list=True, **kwargs):
        """Render the ai_core_performance.html template; returns False when there is nothing to report."""
        if not self.ai_core_performance_issues:
            return self.ai_core_performance_issues

        priority = kwargs.get("priority")
        return html_render.render_template(key="computation",
                                           template_dir="templates",
                                           template_name="ai_core_performance.html",
                                           format_result=self.result,
                                           language=self.language,
                                           add_render_list=add_render_list,
                                           priority_background_color=priority,
                                           rank=kwargs.get("rank"))

    def check_task_list(self, profiling_dataset: ProfilingDataset) -> bool:
        """Return True when the dataset has a non-empty op list carrying shape and dtype columns."""
        if not hasattr(profiling_dataset, "op_summary"):
            logger.warning("Skip %s checker because of not containing %s", self._CHECKER, "op summary")
            return False
        if not hasattr(profiling_dataset.op_summary, "op_list"):
            logger.warning("Skip %s checker because of not containing %s", self._CHECKER, "op_list")
            return False
        op_list = profiling_dataset.op_summary.op_list
        if not op_list:
            # An empty list would make the op_list[0] probe below raise IndexError.
            logger.warning("Skip %s checker because of not containing %s", self._CHECKER, "op_list")
            return False
        if (not hasattr(op_list[0], "input_shapes") or
                not hasattr(op_list[0], "input_data_types")):
            logger.warning("Skip %s checker because of not containing input datas", self._CHECKER)
            return False
        return True

    def _fa_shape_key(self, operator):
        """Shape key of an FA operator; backward operators carry the "-grad" suffix used by data_filter."""
        key = self._shape_key(operator)
        if operator.op_type == "FlashAttentionScoreGrad":
            key += self._FA_GRAD_SUFFIX
        return key

    def _select_rules(self, group, *targets):
        """Return the rules of the given group for each target, None where a target has no rule."""
        rules_by_target = {}
        for rule in self._OPERATOR_RULES.get(group) or []:
            rules_by_target[rule.get("target")] = rule
        return tuple(rules_by_target.get(target) for target in targets)

    def _aggregate(self, operators, ratio_names):
        """
        Average the named ratio attributes over the given operators.

        An operator is counted only when all its ratios and its task_duration
        parse as float, so a partly invalid row no longer skews the sums.
        Returns (means, duration, dtype); means is None when no operator was usable.
        """
        totals = [0.] * len(ratio_names)
        duration, dtype, count = 0., None, 0
        for operator in operators:
            try:
                values = [float(getattr(operator, name)) for name in ratio_names]
                task_duration = float(operator.task_duration)
            except ValueError:
                continue
            totals = [total + value for total, value in zip(totals, values)]
            duration += task_duration
            dtype = operator.input_data_types
            count += 1
        means = [self.safe_divide(total, count) for total in totals]
        if any(mean is None for mean in means):
            return None, duration, dtype
        return means, duration, dtype

    def _top(self, queue, key):
        """Return the _TOP_N entries of queue with the largest value under key."""
        return sorted(queue, key=lambda x: x[key], reverse=True)[:self._TOP_N]

    def _check_cube_inner_axis(self, shape):
        """Return True when the inner axes of the first two inputs are multiples of 256 (affine)."""
        inputs = [tensor.split(",") for tensor in shape.split("-")[0].split(";")]
        if len(inputs[0]) == 4:
            # NZ format: the product of the second and third dimension is checked.
            first_inner = int(inputs[0][1]) * int(inputs[0][2])
            second_inner = int(inputs[1][1]) * int(inputs[1][2])
        else:
            # ND format: the second dimension is checked.
            first_inner = int(inputs[0][1])
            second_inner = int(inputs[1][1])
        return (first_inner % self.INNER_AXIS_256 == 0) and (second_inner % self.INNER_AXIS_256 == 0)

    def _check_fa_inner_axis(self, fa_list, op, shape):
        """
        Check whether an FA operator shape is NOT affine (axes not multiples of 128).

        Returns (affinity_flag, dtype, shape_duration, suggestion).  Note that here
        affinity_flag is True when the shape is NOT affine; dtype and shape_duration
        are only collected in that case.
        """
        affinity_flag = False
        suggestion = ""
        input_first_tensor = shape.split("-")[0].split(";")[0].split(",")
        if "varlen" in op.lower():
            # Variable-length operator: only the third input dimension is checked.
            if int(input_first_tensor[2]) % self.INNER_AXIS_128 != 0:
                affinity_flag = True
                suggestion = self._FA_AFFINITY_DESC_TYPE1
        else:
            # Fixed-length operator: both head_dim and seq_len are checked.
            output_first_tensor = shape.split("-")[1].split(";")[0].split(",")
            seq_len = int(output_first_tensor[2])
            if len(input_first_tensor) == 3:
                head_dim = int(input_first_tensor[2]) / int(output_first_tensor[1])
            else:
                head_dim = int(input_first_tensor[3])
            head_dim_unaligned = head_dim % self.INNER_AXIS_128 != 0
            seq_len_unaligned = seq_len % self.INNER_AXIS_128 != 0
            if head_dim_unaligned and seq_len_unaligned:
                affinity_flag = True
                suggestion = self._FA_AFFINITY_DESC_TYPE3
            elif head_dim_unaligned:
                affinity_flag = True
                suggestion = self._FA_AFFINITY_DESC_TYPE1
            elif seq_len_unaligned:
                affinity_flag = True
                suggestion = self._FA_AFFINITY_DESC_TYPE2

        shape_duration = 0.
        dtype = None
        if affinity_flag:
            for operator in fa_list:
                # _fa_shape_key also matches the "-grad" keys of backward operators.
                if operator.op_name == op and self._fa_shape_key(operator) == shape:
                    shape_duration += float(operator.task_duration)
                    dtype = operator.input_data_types
        return affinity_flag, dtype, shape_duration, suggestion
+

AI CORE Performance Analysis

+
+ {% if language == "cn" %} + {% set title_ns = namespace(type='类别', desc='描述及建议', opti_set='性能优化算子集合', bound_set='bound算子集合', affinity_set='不亲和算子集合', + opti_refer=' 参考性能优化空间: ', bound_refer=' bound类型为: ', affinity_refer=' 不亲和类型为: ', title_desc='算子相关分析,参考如下: ') %} + {% else %} + {% set title_ns = namespace(type='Type', desc='Description and Suggestion', opti_set='set of performance optimization operators', + bound_set='set of bound operators', affinity_set='set of unaffine operators', opti_refer=' refer to Performance Optimization Space: ', + bound_refer=' bound type: ', affinity_refer=' type of disaffinity: ', title_desc=' Operator related analysis, referenced below: ') %} + {% endif %} + {% if format_result.cube[0]|length + format_result.cube[1]|length + format_result.cube[2]|length > 0 %} + MatMul{{ title_ns.title_desc }} +
+ + + + + + {% set opti_ns = namespace(total_opti='') %} + {% for opti in format_result.cube[0] %} + {% if not loop.first %} + {% set opti_ns.total_opti = opti_ns.total_opti ~ "
" ~ opti.op_name ~ " operator shape: " ~ opti.shape ~ " dtype: " ~ opti.dtype ~ title_ns.opti_refer ~ opti.optimization ~ "%" %} + {% else %} + {% set opti_ns.total_opti = opti.op_name ~ " operator shape: " ~ opti.shape ~ " dtype: " ~ opti.dtype ~ title_ns.opti_refer ~ opti.optimization ~ "%" %} + {% endif %} + {% endfor %} + {% if opti_ns.total_opti|length > 0 %} + + + + + {% endif %} + {% set bound_ns = namespace(total_bound='') %} + {% for bound in format_result.cube[1] %} + {% if not loop.first %} + {% set bound_ns.total_bound = bound_ns.total_bound ~ "
" ~ bound.op_name ~ " operator shape: " ~ bound.shape ~ " dtype: " ~ bound.dtype ~ title_ns.bound_refer ~ bound.bound %} + {% else %} + {% set bound_ns.total_bound = bound.op_name ~ " operator shape: " ~ bound.shape ~ " dtype: " ~ bound.dtype ~ title_ns.bound_refer ~ bound.bound %} + {% endif %} + {% endfor %} + {% if bound_ns.total_bound|length > 0 %} + + + + + {% endif %} + {% set affinity_ns = namespace(total_affinity='') %} + {% for affinity in format_result.cube[2] %} + {% if not loop.first %} + {% set affinity_ns.total_affinity = affinity_ns.total_affinity ~ "
" ~ affinity.op_name ~ " operator shape: " ~ affinity.shape ~ " dtype: " ~ affinity.dtype ~ title_ns.affinity_refer ~ affinity.suggestion %} + {% else %} + {% set affinity_ns.total_affinity = affinity.op_name ~ " operator shape: " ~ affinity.shape ~ " dtype: " ~ affinity.dtype ~ title_ns.affinity_refer ~ affinity.suggestion %} + {% endif %} + {% endfor %} + {% if affinity_ns.total_affinity|length > 0 %} + + + + + {% endif %} +
{{ title_ns.type }}{{ title_ns.desc }}
{{ title_ns.opti_set }}{{ opti_ns.total_opti | safe }}
{{ title_ns.bound_set }}{{ bound_ns.total_bound | safe }}
{{ title_ns.affinity_set }}{{ affinity_ns.total_affinity | safe }}
+ {% endif %} + + {% if format_result.fa[0]|length + format_result.fa[1]|length + format_result.fa[2]|length > 0 %} + FA{{ title_ns.title_desc }} +
+ + + + + + {% set opti_ns = namespace(total_opti='') %} + {% for opti in format_result.fa[0] %} + {% if not loop.first %} + {% set opti_ns.total_opti = opti_ns.total_opti ~ "
" ~ opti.op_name ~ " operator shape: " ~ opti.shape ~ " dtype: " ~ opti.dtype ~ title_ns.opti_refer ~ opti.optimization ~ "%" %} + {% else %} + {% set opti_ns.total_opti = opti.op_name ~ " operator shape: " ~ opti.shape ~ " dtype: " ~ opti.dtype ~ title_ns.opti_refer ~ opti.optimization ~ "%" %} + {% endif %} + {% endfor %} + {% if opti_ns.total_opti|length > 0 %} + + + + + {% endif %} + {% set bound_ns = namespace(total_bound='') %} + {% for bound in format_result.fa[1] %} + {% if not loop.first %} + {% set bound_ns.total_bound = bound_ns.total_bound ~ "
" ~ bound.op_name ~ " operator shape: " ~ bound.shape ~ " dtype: " ~ bound.dtype ~ title_ns.bound_refer ~ bound.bound %} + {% else %} + {% set bound_ns.total_bound = bound.op_name ~ " operator shape: " ~ bound.shape ~ " dtype: " ~ bound.dtype ~ title_ns.bound_refer ~ bound.bound %} + {% endif %} + {% endfor %} + {% if bound_ns.total_bound|length > 0 %} + + + + + {% endif %} + {% set affinity_ns = namespace(total_affinity='') %} + {% for affinity in format_result.fa[2] %} + {% if not loop.first %} + {% set affinity_ns.total_affinity = affinity_ns.total_affinity ~ "
" ~ affinity.op_name ~ " operator shape: " ~ affinity.shape ~ " dtype: " ~ affinity.dtype ~ title_ns.affinity_refer ~ affinity.suggestion %} + {% else %} + {% set affinity_ns.total_affinity = affinity.op_name ~ " operator shape: " ~ affinity.shape ~ " dtype: " ~ affinity.dtype ~ title_ns.affinity_refer ~ affinity.suggestion %} + {% endif %} + {% endfor %} + {% if affinity_ns.total_affinity|length > 0 %} + + + + + {% endif %} +
{{ title_ns.type }}{{ title_ns.desc }}
{{ title_ns.opti_set }}{{ opti_ns.total_opti | safe }}
{{ title_ns.bound_set }}{{ bound_ns.total_bound | safe }}
{{ title_ns.affinity_set }}{{ affinity_ns.total_affinity | safe }}
+ {% endif %} + + {% if format_result.vector[0]|length + format_result.vector[1]|length > 0 %} + Vector{{ title_ns.title_desc }} +
+ + + + + + {% set opti_ns = namespace(total_opti='') %} + {% for opti in format_result.vector[0] %} + {% if not loop.first %} + {% set opti_ns.total_opti = opti_ns.total_opti ~ "
" ~ opti.op_name ~ " operator shape: " ~ opti.shape ~ " dtype: " ~ opti.dtype ~ title_ns.opti_refer ~ opti.optimization ~ "%" %} + {% else %} + {% set opti_ns.total_opti = opti.op_name ~ " operator shape: " ~ opti.shape ~ " dtype: " ~ opti.dtype ~ title_ns.opti_refer ~ opti.optimization ~ "%" %} + {% endif %} + {% endfor %} + {% if opti_ns.total_opti|length > 0 %} + + + + + {% endif %} + {% set bound_ns = namespace(total_bound='') %} + {% for bound in format_result.vector[1] %} + {% if not loop.first %} + {% set bound_ns.total_bound = bound_ns.total_bound ~ "
" ~ bound.op_name ~ " operator shape: " ~ bound.shape ~ " dtype: " ~ bound.dtype ~ title_ns.bound_refer ~ bound.bound %} + {% else %} + {% set bound_ns.total_bound = bound.op_name ~ " operator shape: " ~ bound.shape ~ " dtype: " ~ bound.dtype ~ title_ns.bound_refer ~ bound.bound %} + {% endif %} + {% endfor %} + {% if bound_ns.total_bound|length > 0 %} + + + + + {% endif %} +
{{ title_ns.type }}{{ title_ns.desc }}
{{ title_ns.opti_set }}{{ opti_ns.total_opti | safe }}
{{ title_ns.bound_set }}{{ bound_ns.total_bound | safe }}
+ {% endif %} +
+
+{% endif %} \ No newline at end of file diff --git a/profiler/msprof_analyze/advisor/interface/interface.py b/profiler/msprof_analyze/advisor/interface/interface.py index b3afefee57c8c62030af17130f79413238588f8f..cce2de62522bacc0d9bd26b3b14821409871a5f4 100644 --- a/profiler/msprof_analyze/advisor/interface/interface.py +++ b/profiler/msprof_analyze/advisor/interface/interface.py @@ -44,6 +44,8 @@ from msprof_analyze.advisor.analyzer.schedule.gc.gc_analyzer import GcAnalyzer from msprof_analyze.advisor.analyzer.schedule.conjectured_gc.conjectured_gc_analyzer import ConjecturedGcAnalyzer from msprof_analyze.advisor.analyzer.comparison.comparison_analyzer import ComparisonAnalyzer from msprof_analyze.advisor.analyzer.schedule.fusible_ops.fusible_operator_analyzer import FusibleOperatorAnalyzer +from profiler.msprof_analyze.advisor.analyzer.computation.ai_core_performance import \ + AICorePerformanceAnalyzer logger = logging.getLogger() @@ -74,7 +76,8 @@ class Interface: SupportedScopes.OPERATOR_NO_BOUND_ANALYSIS: OperatorBoundAnalyzer, SupportedScopes.BLOCK_DIM_ANALYSIS: BlockDimAnalyzer, SupportedScopes.GRAPH: FusionOPAnalyzer, - SupportedScopes.FREQ_ANALYSIS: AICoreFreqAnalyzer + SupportedScopes.FREQ_ANALYSIS: AICoreFreqAnalyzer, + SupportedScopes.AICORE_PERFORMANCE_ANALYSIS: AICorePerformanceAnalyzer }), COMMUNICATION: OrderedDict({SupportedScopes.PACKET: PacketAnalyzer, SupportedScopes.COMMUNICATION_RETRANSMISSION_DETECTION: RDMARetransmissionAnalyzer, diff --git a/profiler/msprof_analyze/advisor/rules/cn/aicore_performance.yaml b/profiler/msprof_analyze/advisor/rules/cn/aicore_performance.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3f60747b250a07d28b1aa33b9e2c05a06cac89e2 --- /dev/null +++ b/profiler/msprof_analyze/advisor/rules/cn/aicore_performance.yaml @@ -0,0 +1,48 @@ +cube_problem: "Cube算子性能分析" +fa_problem: "FA算子性能分析" +vector_problem: "Vector算子性能分析" +description: "提供一些AICORE算子的参考瓶颈" +bound_description: "bound算子集合" 
+optimization_description: "性能优化算子集合" +affinity_description: "不亲和算子集合" +cube_affinity_desc: "内轴无法被256整除" +fa_affinity_desc_type1: "D不能被128整除" +fa_affinity_desc_type2: "S不能被128整除" +fa_affinity_desc_type3: "D和S均不能被128整除" +suggestion: "请根据亲和性、bound类型或优化空间尝试分析筛选出来的算子" +affinity_suggestion: "{op_name}算子 shape: {shape} dtype: {dtype} 有不亲和特征: {suggestion}\n" +bound_suggestion: "{op_name}算子 shape: {shape} dtype: {dtype} bound类型为: {bound} bound\n" +optimization_suggestion: "{op_name}算子 shape: {shape} dtype: {dtype} 疑似有性能优化空间,参考性能优化空间: {optimization}%\n" + +cube_operators: + - target: aic_mac_ratio + bound: mac + threshold: 0.8 + - target: aic_mte2_ratio + bound: mte2 + threshold: 0.95 + +fa_operators: + - target: aic_mte2_ratio + bound: mac + threshold: 0.8 + - target: aic_fixpipe_ratio + bound: fixpipe + threshold: 0.75 + - target: aiv_vec_ratio + bound: vec + threshold: 0.75 + +vector_operators: + - target: total + bound: vec_mte2_mte3 + threshold: 0.9 + - target: aiv_vec_ratio + bound: vec + threshold: 0.7 + - target: aiv_mte2_ratio + bound: mte2 + threshold: 0.7 + - target: aiv_mte3_ratio + bound: mte3 + threshold: 0.7 \ No newline at end of file diff --git a/profiler/msprof_analyze/advisor/rules/en/aicore_performance.yaml b/profiler/msprof_analyze/advisor/rules/en/aicore_performance.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1e5e4701a383be1c990f2f00013c0ee3b0cd896 --- /dev/null +++ b/profiler/msprof_analyze/advisor/rules/en/aicore_performance.yaml @@ -0,0 +1,48 @@ +cube_problem: "Cube operator performance analysis" +fa_problem: "FA operator performance analysis" +vector_problem: "Vector operator performance analysis" +description: "Provide some reference bottlenecks for the AICORE operator" +bound_description: "set of bound operators" +optimization_description: "set of performance optimization operators" +affinity_description: "set of unaffine operators" +cube_affinity_desc: "The inner axis is not divisible by 256"
+fa_affinity_desc_type1: "D is not divisible by 128" +fa_affinity_desc_type2: "S is not divisible by 128" +fa_affinity_desc_type3: "Neither D nor S is divisible by 128" +suggestion: "Please try to analyze the filtered operators based on affinity, bound type or optimization space" +affinity_suggestion: "{op_name} Op shape: {shape} dtype: {dtype} with disaffection characteristics: {suggestion}\n" +bound_suggestion: "{op_name} Op shape: {shape} dtype: {dtype} bound type: {bound} bound\n" +optimization_suggestion: "{op_name} Op shape: {shape} dtype: {dtype} suspect there is room for performance optimization, refer to Performance Optimization Space: {optimization}%\n" + +cube_operators: + - target: aic_mac_ratio + bound: mac + threshold: 0.8 + - target: aic_mte2_ratio + bound: mte2 + threshold: 0.95 + +fa_operators: + - target: aic_mte2_ratio + bound: mac + threshold: 0.8 + - target: aic_fixpipe_ratio + bound: fixpipe + threshold: 0.75 + - target: aiv_vec_ratio + bound: vec + threshold: 0.75 + +vector_operators: + - target: total + bound: vec_mte2_mte3 + threshold: 0.9 + - target: aiv_vec_ratio + bound: vec + threshold: 0.7 + - target: aiv_mte2_ratio + bound: mte2 + threshold: 0.7 + - target: aiv_mte3_ratio + bound: mte3 + threshold: 0.7 \ No newline at end of file diff --git a/profiler/msprof_analyze/cli/entrance.py b/profiler/msprof_analyze/cli/entrance.py index 534a9b133c7e60d1442cb290490a79e9256ce43d..6a185edb974ed137a34e2e82690c5286b0194b70 100644 --- a/profiler/msprof_analyze/cli/entrance.py +++ b/profiler/msprof_analyze/cli/entrance.py @@ -66,5 +66,4 @@ def msprof_analyze_cli(**kwargs): msprof_analyze_cli.add_command(analyze_cli, name="advisor") msprof_analyze_cli.add_command(compare_cli, name="compare") msprof_analyze_cli.add_command(cluster_cli, name="cluster") -msprof_analyze_cli.add_command(auto_complete_cli, name="auto-completion") - +msprof_analyze_cli.add_command(auto_complete_cli, name="auto-completion") \ No newline at end of file diff --git
a/profiler/msprof_analyze/test/ut/advisor/compute_advice/data/kernel_details.csv b/profiler/msprof_analyze/test/ut/advisor/compute_advice/data/kernel_details.csv new file mode 100644 index 0000000000000000000000000000000000000000..f22cb80080b52cfc828b1b1a67ec319f6147191e --- /dev/null +++ b/profiler/msprof_analyze/test/ut/advisor/compute_advice/data/kernel_details.csv @@ -0,0 +1,30 @@ +Step Id,Model ID,Task ID,Stream ID,Name,Type,OP State,Accelerator Core,Start Time(us),Duration(us),Wait Time(us),Block Dim,Mix Block Dim,HF32 Eligible,Input Shapes,Input Data Types,Input Formats,Output Shapes,Output Data Types,Output Formats,Context ID,aicore_time(us),aic_total_cycles,aic_mac_time(us),aic_mac_ratio,aic_scalar_time(us),aic_scalar_ratio,aic_mte1_time(us),aic_mte1_ratio,aic_mte2_time(us),aic_mte2_ratio,aic_fixpipe_time(us),aic_fixpipe_ratio,aic_icache_miss_rate,aiv_time(us),aiv_total_cycles,aiv_vec_time(us),aiv_vec_ratio,aiv_scalar_time(us),aiv_scalar_ratio,aiv_mte2_time(us),aiv_mte2_ratio,aiv_mte3_time(us),aiv_mte3_ratio,aiv_icache_miss_rate,cube_utilization(%) +19,4294967295,61653,2,aclnnMatmul_MatMulCommon_MatMulV2,MatMulV2,dynamic,AI_CORE,"1736413971558972.912 ",185.504,1.087,16,0,NO,"""81920,4096;8192,512""",DT_BF16;DT_BF16,ND;ND,"""4096,512""",DT_BF16,ND,N/A,183.87,5295467,151.425,0.824,88.03,0.479,119.148,0.648,177.314,0.964,5.736,0.031,0.001,0,0,0,0,0,0,0,0,0,0,0,79.295 +19,4294967295,61669,2,aclnnMatmul_MatMulV3Common_MatMulV3,MatMulV3,dynamic,AI_CORE,"1736413971560588.764 ",501.17,2.2,20,0,NO,"""81920,1536;8192,4096""",DT_BF16;DT_BF16,ND;ND,"""1536,4096""",DT_BF16,ND,N/A,478.701,17233251,356.349,0.744,118.087,0.247,296.009,0.618,452.112,0.944,35.833,0.075,0.001,0,0,0,0,0,0,0,0,0,0,0,95.517 +19,4294967295,61694,2,aclnnMatmul_MatMulCommon_MatMulV2,MatMulV2,dynamic,AI_CORE,"1736413971565213.257 
",186.823,1.178,16,0,NO,"""81920,4096;8192,512""",DT_BF16;DT_BF16,ND;ND,"""4096,512""",DT_BF16,ND,N/A,183.728,5291376,151.502,0.825,87.902,0.478,118.519,0.645,177.654,0.967,5.773,0.031,0.001,0,0,0,0,0,0,0,0,0,0,0,78.675 +19,4294967295,61710,2,aclnnMatmul_MatMulV3Common_MatMulV3,MatMulV3,dynamic,AI_CORE,"1736413971566843.489 ",516.991,2.33,20,0,NO,"""81920,1536;8192,4096""",DT_BF16;DT_BF16,ND;ND,"""1536,4096""",DT_BF16,ND,N/A,491.775,17703905,356.249,0.724,118.59,0.241,295.046,0.6,463.696,0.943,37.671,0.077,0.001,0,0,0,0,0,0,0,0,0,0,0,95.123 +19,4294967295,61735,2,aclnnMatmul_MatMulCommon_MatMulV2,MatMulV2,dynamic,AI_CORE,"1736413971571596.404 ",187.724,0.766,16,0,NO,"""81920,4096;8192,512""",DT_BF16;DT_BF16,ND;ND,"""4096,512""",DT_BF16,ND,N/A,184.904,5325221,151.489,0.819,87.893,0.475,118.63,0.642,178.815,0.967,5.77,0.031,0.001,0,0,0,0,0,0,0,0,0,0,0,78.798 +19,4294967295,61751,2,aclnnMatmul_MatMulV3Common_MatMulV3,MatMulV3,dynamic,AI_CORE,"1736413971573223.437 ",514.87,2.15,20,0,NO,"""81920,1536;8192,4096""",DT_BF16;DT_BF16,ND;ND,"""1536,4096""",DT_BF16,ND,N/A,486.931,17529512,356.117,0.731,118.847,0.244,295.529,0.607,457.002,0.939,37.938,0.078,0.001,0,0,0,0,0,0,0,0,0,0,0,94.574 +19,4294967295,61776,2,aclnnMatmul_MatMulCommon_MatMulV2,MatMulV2,dynamic,AI_CORE,"1736413971577931.851 ",190.544,1.367,16,0,NO,"""81920,4096;8192,512""",DT_BF16;DT_BF16,ND;ND,"""4096,512""",DT_BF16,ND,N/A,187.073,5387702,151.741,0.811,87.935,0.47,117.467,0.628,181.043,0.968,5.803,0.031,0.001,0,0,0,0,0,0,0,0,0,0,0,78.543 +19,4294967295,61792,2,aclnnMatmul_MatMulV3Common_MatMulV3,MatMulV3,dynamic,AI_CORE,"1736413971579566.403 ",504.071,2.28,20,0,NO,"""81920,1536;8192,4096""",DT_BF16;DT_BF16,ND;ND,"""1536,4096""",DT_BF16,ND,N/A,485.542,17479517,356.283,0.734,117.755,0.243,296.421,0.61,455.064,0.937,37.75,0.078,0.001,0,0,0,0,0,0,0,0,0,0,0,96.324 +19,4294967295,13792,2,aclnnMatmul_MatMulV3Common_MatMulV5,MatMulV3,dynamic,AI_CORE,"1736413974248200.543 
",521.31,2.22,20,0,NO,"""8192,15365;8192,4096""",DT_BF16;DT_BF16,ND;ND,"""1536,4096""",DT_BF16,ND,N/A,499.234,17972434,356.364,0.714,117.639,0.236,295.58,0.592,471.784,0.945,35.825,0.072,0.001,0,0,0,0,0,0,0,0,0,0,0,95.765 +19,4294967295,13792,2,aclnnMatmul_MatMulV3Common_MatMulV5,MatMulV3,dynamic,AI_CORE,"1736413974248200.543 ",521.31,2.22,20,0,NO,"""8192,15365;8192,4096""",DT_BF16;DT_BF16,ND;ND,"""1536,4096""",DT_BF16,ND,N/A,499.234,17972434,356.364,0.714,117.639,0.236,295.58,0.592,471.784,0.945,35.825,0.072,0.001,0,0,0,0,0,0,0,0,0,0,0,95.765 +19,4294967295,13792,2,aclnnMatmul_MatMulV3Common_MatMulV5,MatMulV3,dynamic,AI_CORE,"1736413974248200.543 ",521.31,2.22,20,0,NO,"""8192,15365;8192,4096""",DT_BF16;DT_BF16,ND;ND,"""1536,4096""",DT_BF16,ND,N/A,499.234,17972434,356.364,0.714,117.639,0.236,295.58,0.592,471.784,0.945,35.825,0.072,0.001,0,0,0,0,0,0,0,0,0,0,0,95.765 +19,4294967295,13792,2,aclnnMatmul_MatMulV3Common_MatMulV5,MatMulV3,dynamic,AI_CORE,"1736413974248200.543 ",521.31,2.22,20,0,NO,"""8192,15365;8192,4096""",DT_BF16;DT_BF16,ND;ND,"""1536,4096""",DT_BF16,ND,N/A,499.234,17972434,356.364,0.714,117.639,0.236,295.58,0.592,471.784,0.945,35.825,0.072,0.001,0,0,0,0,0,0,0,0,0,0,0,95.765 +19,4294967295,60679,2,aclnnFlashAttentionScore_FlashAttentionScore_FlashAttentionScore,FlashAttentionScore,dynamic,MIX_AIC,"1736413971411629.128 ",410.188,1.53,20,40,NO,"""4096,2,512;4096,2,512;4096,2,512;;;;4096,4096;;;;;""",DT_BF16;DT_BF16;DT_BF16;DT_BF16;UINT8;DT_BF16;BOOL;INT64;INT64;INT64;INT64;INT64,NCL;NCL;NCL;ND;ND;ND;ND;ND;ND;ND;ND;ND,"""2,4,4096,8;2,4,4096,8;;4096,2,512""",FLOAT;FLOAT;DT_BF16;DT_BF16,ND;ND;ND;ND,0,366.147,13181275,129.055,0.352,352.275,0.962,108.364,0.296,172.86,0.872,216.141,0.59,0.003,365.782,26336326,228.687,0.625,137.979,0.377,118.603,0.324,71.448,0.195,0.013,89.263 +19,4294967295,60707,2,aclnnFlashAttentionScore_FlashAttentionScore_FlashAttentionScore,FlashAttentionScore,dynamic,MIX_AIC,"1736413971415611.468 
",406.128,1.279,20,40,NO,"""4096,2,512;4096,2,512;4096,2,512;;;;4096,4096;;;;;""",DT_BF16;DT_BF16;DT_BF16;DT_BF16;UINT8;DT_BF16;BOOL;INT64;INT64;INT64;INT64;INT64,NCL;NCL;NCL;ND;ND;ND;ND;ND;ND;ND;ND;ND,"""2,4,4096,8;2,4,4096,8;;4096,2,512""",FLOAT;FLOAT;DT_BF16;DT_BF16,ND;ND;ND;ND,0,358.77,12915719,128.96,0.359,345.096,0.962,108.337,0.302,168.284,0.869,209.057,0.583,0.003,358.308,25798146,228.693,0.638,137.809,0.385,108.679,0.303,70.099,0.196,0.013,88.339 +19,4294967295,60735,2,aclnnFlashAttentionScore_FlashAttentionScore_FlashAttentionScore,FlashAttentionScore,dynamic,MIX_AIC,"1736413971420248.800 ",407.008,0.84,20,40,NO,"""4096,2,512;4096,2,512;4096,2,512;;;;4096,4096;;;;;""",DT_BF16;DT_BF16;DT_BF16;DT_BF16;UINT8;DT_BF16;BOOL;INT64;INT64;INT64;INT64;INT64,NCL;NCL;NCL;ND;ND;ND;ND;ND;ND;ND;ND;ND,"""2,4,4096,8;2,4,4096,8;;4096,2,512""",FLOAT;FLOAT;DT_BF16;DT_BF16,ND;ND;ND;ND,0,359.702,12949284,128.975,0.359,346.306,0.963,108.43,0.301,166.899,0.864,209.018,0.581,0.003,359.274,25867705,228.693,0.637,138.438,0.385,107.723,0.3,70.146,0.195,0.013,88.377 +19,4294967295,60763,2,aclnnFlashAttentionScore_FlashAttentionScore_FlashAttentionScore,FlashAttentionScore,dynamic,MIX_AIC,"1736413971424592.447 ",405.228,1.35,20,40,NO,"""4096,2,512;4096,2,512;4096,2,512;;;;4096,4096;;;;;""",DT_BF16;DT_BF16;DT_BF16;DT_BF16;UINT8;DT_BF16;BOOL;INT64;INT64;INT64;INT64;INT64,NCL;NCL;NCL;ND;ND;ND;ND;ND;ND;ND;ND;ND,"""2,4,4096,8;2,4,4096,8;;4096,2,512""",FLOAT;FLOAT;DT_BF16;DT_BF16,ND;ND;ND;ND,0,359.793,12952532,128.923,0.358,345.768,0.961,108.411,0.301,167.379,0.865,208.79,0.58,0.003,359.294,25869164,228.691,0.637,138.411,0.385,107.868,0.3,70.163,0.195,0.013,88.788 +19,4294967295,61655,2,aclnnFlashAttentionScoreGrad_FlashAttentionScoreGrad_FlashAttentionScoreGrad,FlashAttentionScoreGrad,dynamic,MIX_AIC,"1736413971559180.676 
",762.215,1.37,20,40,NO,"""4096,2,512;4096,2,512;4096,2,512;4096,2,512;4096,4096;2,4,4096,8;2,4,4096,8;;4096,2,512;""",DT_BF16;DT_BF16;DT_BF16;DT_BF16;BOOL;FLOAT;FLOAT;DT_BF16;DT_BF16;INT64,NCL;NCL;NCL;NCL;ND;NCHW;NCHW;ND;NCL;ND,"""4096,2,512;4096,2,512;4096,2,512;""",DT_BF16;DT_BF16;DT_BF16;DT_BF16,ND;ND;ND;ND,0,755.664,27203907,344.023,0.455,592.472,0.784,266.388,0.353,397.091,0.525,589.726,0.525,0.004,755.04,54362915,318.452,0.422,184.623,0.245,206.78,0.274,152.973,0.203,0.006,99.141 +19,4294967295,61696,2,aclnnFlashAttentionScoreGrad_FlashAttentionScoreGrad_FlashAttentionScoreGrad,FlashAttentionScoreGrad,dynamic,MIX_AIC,"1736413971565420.821 ",763.215,1.189,20,40,NO,"""4096,2,512;4096,2,512;4096,2,512;4096,2,512;4096,4096;2,4,4096,8;2,4,4096,8;;4096,2,512;""",DT_BF16;DT_BF16;DT_BF16;DT_BF16;BOOL;FLOAT;FLOAT;DT_BF16;DT_BF16;INT64,NCL;NCL;NCL;NCL;ND;NCHW;NCHW;ND;NCL;ND,"""4096,2,512;4096,2,512;4096,2,512;""",DT_BF16;DT_BF16;DT_BF16;DT_BF16,ND;ND;ND;ND,0,757.83,27281885,344.047,0.454,595.954,0.786,266.123,0.351,389.105,0.513,576.226,0.513,0.004,757.046,54507345,318.443,0.421,188.292,0.249,200.176,0.264,162.113,0.214,0.006,99.294 +19,4294967295,61737,2,aclnnFlashAttentionScoreGrad_FlashAttentionScoreGrad_FlashAttentionScoreGrad,FlashAttentionScoreGrad,dynamic,MIX_AIC,"1736413971571804.228 ",757.095,0.88,20,40,NO,"""4096,2,512;4096,2,512;4096,2,512;4096,2,512;4096,4096;2,4,4096,8;2,4,4096,8;;4096,2,512;""",DT_BF16;DT_BF16;DT_BF16;DT_BF16;BOOL;FLOAT;FLOAT;DT_BF16;DT_BF16;INT64,NCL;NCL;NCL;NCL;ND;NCHW;NCHW;ND;NCL;ND,"""4096,2,512;4096,2,512;4096,2,512;""",DT_BF16;DT_BF16;DT_BF16;DT_BF16,ND;ND;ND;ND,0,750.605,27021778,343.983,0.458,586.708,0.782,266.304,0.355,392.522,0.523,584.432,0.523,0.004,749.913,53993736,318.436,0.425,188.508,0.251,207.668,0.277,152.634,0.204,0.006,99.143 +19,4294967295,61778,2,aclnnFlashAttentionScoreGrad_FlashAttentionScoreGrad_FlashAttentionScoreGrad,FlashAttentionScoreGrad,dynamic,MIX_AIC,"1736413971578144.095 
",755.915,1.22,20,40,NO,"""4096,2,512;4096,2,512;4096,2,512;4096,2,512;4096,4096;2,4,4096,8;2,4,4096,8;;4096,2,512;""",DT_BF16;DT_BF16;DT_BF16;DT_BF16;BOOL;FLOAT;FLOAT;DT_BF16;DT_BF16;INT64,NCL;NCL;NCL;NCL;ND;NCHW;NCHW;ND;NCL;ND,"""4096,2,512;4096,2,512;4096,2,512;""",DT_BF16;DT_BF16;DT_BF16;DT_BF16,ND;ND;ND;ND,0,750.152,27005467,344.115,0.459,579.317,0.772,266.08,0.355,398.019,0.531,587.37,0.531,0.004,749.348,53953058,318.444,0.425,186.908,0.249,207.068,0.276,151.329,0.202,0.006,99.238 +19,4294967295,60763,2,aclnnFlashAttentionScore_FlashAttentionScore_FlashAttentionScore_varlen,FlashAttentionScore,dynamic,MIX_AIC,"1736413971424592.447 ",405.228,1.35,20,40,NO,"""4096,2,511;4096,2,512;4096,2,512;;;;4096,4096;;;;;""",DT_BF16;DT_BF16;DT_BF16;DT_BF16;UINT8;DT_BF16;BOOL;INT64;INT64;INT64;INT64;INT64,NCL;NCL;NCL;ND;ND;ND;ND;ND;ND;ND;ND;ND,"""2,3,4096,8;2,4,4096,8;;4096,2,512""",FLOAT;FLOAT;DT_BF16;DT_BF16,ND;ND;ND;ND,0,359.793,12952532,128.923,0.358,345.768,0.961,108.411,0.301,167.379,0.465,208.79,0.58,0.003,359.294,25869164,228.691,0.637,138.411,0.385,107.868,0.3,70.163,0.195,0.013,88.788 +19,4294967295,60683,2,aclnnAdd_AddAiCore_Add,Add,dynamic,AI_VECTOR_CORE,"1736413971412768.871 ",26.78,0.485,40,0,NO,"""512,2,4096;512,2,4096""",DT_BF16;DT_BF16,NCL;NCL,"""512,2,4096""",DT_BF16,ND,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,24.19,1741674,5.986,0.247,1.352,0.056,20.363,0.842,3.195,0.132,0.027,0 +19,4294967295,60690,2,aclnnAdd_AddAiCore_Add,Add,dynamic,AI_VECTOR_CORE,"1736413971414677.549 ",31.201,0.664,40,0,NO,"""512,2,4096;512,2,4096""",DT_BF16;DT_BF16,NCL;NCL,"""512,2,4096""",DT_BF16,ND,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,28.617,2060443,5.986,0.209,1.444,0.05,25.005,0.874,3.336,0.117,0.026,0 +19,4294967295,60711,2,aclnnAdd_AddAiCore_Add,Add,dynamic,AI_VECTOR_CORE,"1736413971416743.250 ",27.021,1.246,40,0,NO,"""512,2,4096;512,2,4096""",DT_BF16;DT_BF16,NCL;NCL,"""512,2,4096""",DT_BF16,ND,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,24.304,1749862,5.986,0.246,1.258,0.052,20.424,0.84,3.23,0.133,0.027,0 
+19,4294967295,60718,2,aclnnAdd_AddAiCore_Add,Add,dynamic,AI_VECTOR_CORE,"1736413971419318.962 ",25.08,0.984,40,0,NO,"""512,2,4096;512,2,4096""",DT_BF16;DT_BF16,NCL;NCL,"""512,2,4096""",DT_BF16,ND,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,22.47,1617840,5.989,0.267,2.009,0.089,18.809,0.837,3.191,0.142,0.024,0 +19,4294967295,13907,2,aclnnAdd_AddAiCore_Add,Add,dynamic,AI_VECTOR_CORE,"1736413974268377.206 ",1.38,31.48,1,0,NO,""";""",FLOAT;FLOAT,ND;ND,"""""",FLOAT,ND,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0.883,1589,0.027,0.03,0.265,0.3,0.18,0.204,0.108,0.123,0.182,0 +19,4294967295,13910,2,aclnnAdd_AddAiCore_Add,Add,dynamic,AI_VECTOR_CORE,"1736413974268502.128 ",1.46,17.48,1,0,NO,""";""",FLOAT;FLOAT,ND;ND,"""""",FLOAT,ND,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0.948,1706,0.027,0.028,0.276,0.291,0.217,0.229,0.127,0.134,0.174,0 +19,4294967295,13913,2,aclnnAdd_AddAiCore_Add,Add,dynamic,AI_VECTOR_CORE,"1736413974268605.410 ",1.5,0.09,1,0,NO,""";""",FLOAT;FLOAT,ND;ND,"""""",FLOAT,ND,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,0.96,1728,0.027,0.028,0.268,0.28,0.221,0.23,0.132,0.137,0.145,0 +19,4294967295,13916,2,aclnnAdd_AddAiCore_Add,Add,dynamic,AI_VECTOR_CORE,"1736413974268747.953 ",1.58,28.28,1,0,NO,""";""",FLOAT;FLOAT,ND;ND,"""""",FLOAT,ND,N/A,0,0,0,0,0,0,0,0,0,0,0,0,0,1.107,1993,0.027,0.024,0.426,0.384,0.201,0.181,0.118,0.106,0.162,0 diff --git a/profiler/msprof_analyze/test/ut/advisor/compute_advice/test_ai_core_performance_advice.py b/profiler/msprof_analyze/test/ut/advisor/compute_advice/test_ai_core_performance_advice.py new file mode 100644 index 0000000000000000000000000000000000000000..e45f6ea3b83d5927df03e73b642959a473dc522d --- /dev/null +++ b/profiler/msprof_analyze/test/ut/advisor/compute_advice/test_ai_core_performance_advice.py @@ -0,0 +1,74 @@ +import csv +import os +import shutil +import stat + +import unittest +from profiler.msprof_analyze.advisor.interface.interface import Interface +from profiler.msprof_analyze.advisor.common.analyzer_scopes import SupportedScopes + + +class 
class TestAICorePerformanceAdvice(unittest.TestCase):
    """Run the AI Core performance analysis scope against a canned kernel_details.csv.

    Each test builds a throw-away profiling directory (``TMP_DIR``) containing
    an ``ASCEND_PROFILER_OUTPUT`` sub-directory, copies the CSV fixture into
    it, runs the analyzer through ``Interface`` and inspects the result.
    The directory and any ``mstt*`` report files found next to this test
    module are removed before and after every test.
    """

    TMP_DIR = "./ascend_pt"
    OUTPUT_DIR = "./ascend_pt/ASCEND_PROFILER_OUTPUT"
    interface = None
    err_interface = None

    def tearDown(self):
        """Remove the temporary profiling directory and generated reports."""
        if os.path.exists(TestAICorePerformanceAdvice.TMP_DIR):
            shutil.rmtree(TestAICorePerformanceAdvice.TMP_DIR)
        self.clear_htmls()

    def setUp(self):
        """Create a fresh, empty profiling directory tree for the test."""
        if os.path.exists(TestAICorePerformanceAdvice.TMP_DIR):
            shutil.rmtree(TestAICorePerformanceAdvice.TMP_DIR)
        # OUTPUT_DIR lives under TMP_DIR, so one call creates both levels.
        os.makedirs(TestAICorePerformanceAdvice.OUTPUT_DIR, exist_ok=True)
        self.clear_htmls()

    @classmethod
    def clear_htmls(cls):
        """Delete files whose name starts with ``mstt`` from this module's directory."""
        current_path = os.path.dirname(os.path.abspath(__file__))
        for filename in os.listdir(current_path):
            if not filename.startswith("mstt"):
                continue
            file_path = os.path.join(current_path, filename)
            # os.remove() raises on directories; only plain files are reports.
            if os.path.isfile(file_path):
                os.remove(file_path)

    @classmethod
    def copy_kernel_details(cls, path):
        """Copy the fixture ``data/<path>`` into ``OUTPUT_DIR`` as kernel_details.csv.

        The fixture is looked up relative to this test module rather than the
        current working directory, so the test does not depend on where the
        test runner was started.

        Args:
            path: File name of the fixture inside the ``data`` directory.

        Raises:
            FileNotFoundError: If the fixture file does not exist.
        """
        test_dir = os.path.dirname(os.path.abspath(__file__))
        source_csv_path = os.path.join(test_dir, "data", path)
        destination_csv_path = f"{TestAICorePerformanceAdvice.OUTPUT_DIR}/kernel_details.csv"

        if not os.path.exists(source_csv_path):
            raise FileNotFoundError(f"test data file not found:{source_csv_path}")

        os.makedirs(TestAICorePerformanceAdvice.OUTPUT_DIR, exist_ok=True)
        shutil.copyfile(source_csv_path, destination_csv_path)

    def test_ai_core_performance_total(self):
        """Every expected result section must expose more than one entry per table."""
        self.copy_kernel_details("kernel_details.csv")
        interface = Interface(profiling_path=self.TMP_DIR)
        dimension = Interface.COMPUTATION
        scope = SupportedScopes.AICORE_PERFORMANCE_ANALYSIS
        result = interface.get_result(dimension, scope, render_html=1, output_dict=False,
                                      profiling_path=self.TMP_DIR)
        # (section title, number of leading "data" entries to check).
        # NOTE(review): the titles match the cn rule file, so this presumably
        # relies on the advisor running with the Chinese language setting.
        expected_sections = (
            ("Cube算子性能分析", 3),
            ("FA算子性能分析", 3),
            ("Vector算子性能分析", 2),
        )
        try:
            for title, table_count in expected_sections:
                section = result.data.get(title)
                self.assertIsNotNone(section, f"missing result section: {title}")
                tables = section.get("data")
                for index in range(table_count):
                    self.assertLess(1, len(tables[index]))
        finally:
            # Always reset the result object, even when an assertion fails.
            result.clear()