diff --git a/.gitignore b/.gitignore index a81c8ee121952cf06bfaf9ff9988edd8cded763c..c70c40e0f527c8c20a6bf994bcb8070b95e13e27 100644 --- a/.gitignore +++ b/.gitignore @@ -136,3 +136,10 @@ dmypy.json # Cython debug symbols cython_debug/ + +# vscode settings and analysis output +.vscode/ +att_advisor*.html +*.xlsx +operator_tuning_file*.cfg +.ipynb_checkpoints/ \ No newline at end of file diff --git a/profiler/advisor/analyzer/base_analyzer.py b/profiler/advisor/analyzer/base_analyzer.py index 7e0ce3b8968f4cfe64da3ad33eb09eb1e6e50ab0..5f4bd3202cd2071088f25564a7d4b14144a34826 100644 --- a/profiler/advisor/analyzer/base_analyzer.py +++ b/profiler/advisor/analyzer/base_analyzer.py @@ -17,11 +17,10 @@ class BaseAnalyzer(VersionControl, metaclass=ABCMeta): dataset_cls_list = [] - def __init__(self, collection_path, n_processes: int = 1, cann_version=constant.DEFAULT_CANN_VERSION, - torch_version=constant.DEFAULT_TORCH_VERSION, **kwargs): + def __init__(self, collection_path, n_processes: int = 1, **kwargs): self.n_processes = n_processes - self.cann_version = cann_version - self.torch_version = torch_version + self.cann_version = kwargs.get("cann_version", constant.DEFAULT_CANN_VERSION) + self.torch_version = kwargs.get("torch_version", constant.DEFAULT_TORCH_VERSION) self.html_render = HTMLRender() self.collection_path = collection_path self.kwargs = kwargs @@ -41,7 +40,7 @@ class BaseAnalyzer(VersionControl, metaclass=ABCMeta): def decorate(func): @wraps(func) - def wrapper(self): + def wrapper(self, **kwargs): data = self.dataset_list if data is None: return None @@ -57,7 +56,7 @@ class BaseAnalyzer(VersionControl, metaclass=ABCMeta): return decorate @abstractmethod - def optimize(self): + def optimize(self, **kwargs): pass @abstractmethod diff --git a/profiler/advisor/analyzer/cluster/slow_link_analyser.py b/profiler/advisor/analyzer/cluster/slow_link_analyser.py index a4c38187a792d6bde3ed5426b62b165c87627acf..846b79a50f31abb8445a0e5c2e82aaaf3c8ee23d 100644 --- a/profiler/advisor/analyzer/cluster/slow_link_analyser.py +++ b/profiler/advisor/analyzer/cluster/slow_link_analyser.py @@ -37,9 +37,8 @@ class SlowLinkAnalyzer(BaseAnalyzer): SLOW_LINK_ANALYSIS = "slow_link_analysis" dataset_cls_list = [ClusterCommunicationDataSet] - def __init__(self, collection_path, n_processes: int = 1, cann_version=constant.DEFAULT_CANN_VERSION, - torch_version=constant.DEFAULT_TORCH_VERSION, **kwargs): - super().__init__(collection_path, n_processes, cann_version, torch_version, **kwargs) + def __init__(self, collection_path, n_processes: int = 1, **kwargs): + super().__init__(collection_path, n_processes, **kwargs) key = ClusterCommunicationDataSet.get_key() self.communication_data_class = self.get_first_data_by_key(self.dataset_list, key) self.rank_bw_dict = self.communication_data_class.get_data() diff --git a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py index ff80178e239feed79809c72e306356b9e305017e..4215b514a215a2a350571746ff9cb90c3c9956eb 100644 --- a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py +++ b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py @@ -29,9 +29,8 @@ class SlowRankAnalyzer(BaseAnalyzer): BOTTLENECK_LIST = ['Computing', 'Communication', "Free"] dataset_cls_list = [ClusterStepTraceTimeDataSet] - def __init__(self, collection_path, n_processes: int = 1, cann_version=constant.DEFAULT_CANN_VERSION, - torch_version=constant.DEFAULT_TORCH_VERSION, **kwargs): - super().__init__(collection_path, n_processes, cann_version, torch_version, **kwargs) + def __init__(self, collection_path, n_processes: int = 1, **kwargs): + super().__init__(collection_path, n_processes, **kwargs) key = ClusterStepTraceTimeDataSet.get_key() self.step_trace_class = self.get_first_data_by_key(self.dataset_list, key) self.step_trace_dict = self.step_trace_class.get_data() diff --git a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py index d90ef56c78b52987bc7d1dbb2c17d5caf9eb7706..a7d7ddd93c70e59dc0d10318fdac06fdc581f70c 100644 --- a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py +++ b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py @@ -20,7 +20,11 @@ class BlockDimChecker(OperatorChecker): "input_data_types", "input_formats", "output_shapes", "output_data_types", "output_formats" ] + def pre_check(self, profiling_data) -> bool: + return not self.is_dynamic_shape(profiling_data) + def _check_data(self, data): + self.format_suggestion_content(data) if not self._check_summary(data): return False if not Config().get_config("ai_core_num"): @@ -69,9 +73,3 @@ class BlockDimChecker(OperatorChecker): else: core_num = self._aiv_num return core_num - - def format_suggestion_content(self, profiling_data: ProfilingDataset) -> None: - if profiling_data.PROF_TYPE == constant.ASCEND_PYTORCH_PROFILER: - self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION) - elif profiling_data.PROF_TYPE == constant.MSLITE: - self._SUGGESTION.append(self.MSLite_OPERATOR_TUNE_SUGGESTION) diff --git a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py index 4ede3c94e6703f2ce38f9339db8fe9405fcfa82f..a22b380f974b14207d6d7be262cd49f0ba0fbe99 100644 --- a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py +++ b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py @@ -24,12 +24,15 @@ class OperatorBoundChecker(OperatorChecker): "output_data_types", "output_formats" ] + def pre_check(self, profiling_data) -> bool: + return not self.is_dynamic_shape(profiling_data) + def _check_data(self, data): + self.format_suggestion_content(data) if not self._check_summary(data): return False for op_info in data.op_summary.op_list: - if self._check_operator(op_info): - return True + return self._check_operator(op_info) logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "ratio in op summary") return False @@ -48,9 +51,3 @@ class OperatorBoundChecker(OperatorChecker): template_dir="templates", template_name="operator_no_bound.html", format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK)) - - def format_suggestion_content(self, profiling_data: ProfilingDataset) -> None: - if profiling_data.PROF_TYPE == constant.ASCEND_PYTORCH_PROFILER: - self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION) - elif profiling_data.PROF_TYPE == constant.MSLITE: - self._SUGGESTION.append(self.MSLite_OPERATOR_TUNE_SUGGESTION) diff --git a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py index 070b3a3b57b0a3d850a2e34bf408c5cf6c2a9610..86d3bac4ff8cb163d23a6365307b855839b12a6a 100644 --- a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py +++ b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py @@ -25,33 +25,7 @@ class DynamicShapeChecker(OperatorChecker): super().__init__(cann_version=cann_version) def check(self, profiling_database) -> bool: - less_than_cann800_list = [constant.CANN_VERSION_C30, constant.CANN_VERSION_C13, constant.CANN_VERSION_C15] - # CANN 8.0.0 之前从 ge_info 中获取 op_state 属性,进行动态 shape 逻辑判断 - if self.cann_version in less_than_cann800_list: - if hasattr(profiling_database, "ge_info"): - ge_info = profiling_database.ge_info - static_shape_operators = ge_info.get_static_shape_operators() - if len(static_shape_operators) == 0: - OperatorChecker.IS_ALL_OPERATOR_DYNAMIC_SHAPE = True - return True - else: - logger.warning( - "Skip dynamic shape checker because of not containing ge_info.db file in host filefloder.\n" - "To enable dynamic shape checker, please try to set data_simplification=False in experimental_config.\n" - "More details please refer to link : %s", constant.ASCEND_PROFILER_URL) - else: - # CANN 8.0.0 之后 op_state 属性从 op_summary 文件中获取 - if hasattr(profiling_database, "op_summary"): - static_shape_operators = profiling_database.op_summary.get_static_shape_operators() - if len(static_shape_operators) == 0: - OperatorChecker.IS_ALL_OPERATOR_DYNAMIC_SHAPE = True - return True - else: - logger.warning( - "Skip dynamic shape checker because of not containing op_summary.csv file in current filefloder." - ) - - return False + return self.is_dynamic_shape(profiling_database) def make_record(self, profiling_database) -> OptimizeRecord: """ diff --git a/profiler/advisor/analyzer/computation/operator_checker.py b/profiler/advisor/analyzer/computation/operator_checker.py index 6bb837004b4282e406d8e1c7b3c5c2a135b9be0c..0f47650943a7355b494bd766214d10526c46c0fa 100644 --- a/profiler/advisor/analyzer/computation/operator_checker.py +++ b/profiler/advisor/analyzer/computation/operator_checker.py @@ -15,8 +15,7 @@ logger = logging.getLogger() class OperatorChecker(VersionControl): - _SUPPORT_VERSIONS = [constant.CANN_VERSION_C30, constant.CANN_VERSION_C13, constant.CANN_VERSION_C15, constant.CANN_VERSION_C17] - IS_ALL_OPERATOR_DYNAMIC_SHAPE = False + _SUPPORT_VERSIONS = constant.SUPPORTED_CANN_VERSION _MAX_TUNE_OP_NUM = constant.OPERATOR_OUT_TOPK _MIN_TASK_DURATION = 0 _MIN_TASK_DURATION_RATIO = 1.0 @@ -115,10 +114,33 @@ class OperatorChecker(VersionControl): return description def pre_check(self, profiling_data) -> bool: - self.format_suggestion_content(profiling_data) - return not (OperatorChecker.IS_ALL_OPERATOR_DYNAMIC_SHAPE and ( - OperatorChecker.PyTorch_OPERATOR_TUNE_SUGGESTION or OperatorChecker.MSLite_OPERATOR_TUNE_SUGGESTION - ) in self._SUGGESTION) + return True + + def is_dynamic_shape(self, profiling_database: ProfilingDataset) -> bool: + less_than_cann800_list = [constant.CANN_VERSION_C30, constant.CANN_VERSION_C13, constant.CANN_VERSION_C15] + # CANN 8.0.0 之前从 ge_info 中获取 op_state 属性,进行动态 shape 逻辑判断 + if self.cann_version in less_than_cann800_list: + if hasattr(profiling_database, "ge_info"): + ge_info = profiling_database.ge_info + static_shape_operators = ge_info.get_static_shape_operators() + if len(static_shape_operators) == 0: + return True + else: + logger.warning( + "Skip dynamic shape check because of not containing ge_info.db file in host filefloder.\n" + "To enable dynamic shape check, please try to set data_simplification=False in experimental_config.\n" + "More details please refer to link : %s", constant.ASCEND_PROFILER_URL) + else: + # CANN 8.0.0 之后 op_state 属性从 op_summary 文件中获取 + if hasattr(profiling_database, "op_summary"): + static_shape_operators = profiling_database.op_summary.get_static_shape_operators() + if len(static_shape_operators) == 0: + return True + else: + logger.warning( + "Skip dynamic shape check because of not containing op_summary.csv file in current filefloder." + ) + return False def format_operator_result(self, record, limit): """ @@ -279,4 +301,7 @@ class OperatorChecker(VersionControl): return details def format_suggestion_content(self, profiling_data: ProfilingDataset) -> None: - return + if profiling_data.PROF_TYPE == constant.ASCEND_PYTORCH_PROFILER: + self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION) + elif profiling_data.PROF_TYPE == constant.MSLITE: + self._SUGGESTION.append(self.MSLite_OPERATOR_TUNE_SUGGESTION) diff --git a/profiler/advisor/analyzer/computation/profiling_analyzer.py b/profiler/advisor/analyzer/computation/profiling_analyzer.py index 98d3c5c49b74362137126ec1276c3684284662f0..8682617700702055628a31982b0eafab9feb336d 100644 --- a/profiler/advisor/analyzer/computation/profiling_analyzer.py +++ b/profiler/advisor/analyzer/computation/profiling_analyzer.py @@ -22,45 +22,40 @@ class ProfilingAnalyzer(BaseAnalyzer, ABC): dataset_cls_list = [ProfilingDataset] def __init__(self, collection_path, **kwargs) -> None: - cann_version = kwargs.get("cann_version", constant.DEFAULT_CANN_VERSION) - torch_version = kwargs.get("torch_version", constant.DEFAULT_TORCH_VERSION) - super().__init__(collection_path, cann_version=cann_version, torch_version=torch_version, **kwargs) - self.checker_list = [checker(cann_version) for checker in get_supported_subclass(OperatorChecker, cann_version)] - # 动态 shape checker 放到首位,因为动态 shape 情形下AOE算子调优现在不支持,AOE 算子调优 checker 可以跳过 - index = next((i for i, item in enumerate(self.checker_list) if isinstance(item, DynamicShapeChecker)), None) - self.checker_list.insert(0, self.checker_list.pop(index)) + super().__init__(collection_path, **kwargs) + self.checker = OperatorChecker(self.cann_version) self.html_render = HTMLRender() self.result = OptimizeResult() @BaseAnalyzer.check_data((ProfilingDataset.get_key(),)) - def optimize(self) -> OptimizeResult: + def optimize(self, **kwargs) -> OptimizeResult: """ optimize operator :param data: input datasets :return: result """ profiling_data = self.get_first_data_by_key(self.dataset_list, ProfilingDataset.get_key()) - for checker in self.checker_list: - if not checker.pre_check(profiling_data): - continue - if checker.check(profiling_data): - # add record - record = checker.make_record(profiling_data) - checker.make_render(self.html_render, record) - self.result.add(record) - # add details - details = checker.get_details() - if details: - for i, detail in enumerate(details): - if i == 0: - # the first row is header - self.result.add_detail(checker.get_name(), headers=detail) - else: - self.result.add_detail(checker.get_name(), detail=detail) - # add tune op list - tune_op_list = checker.get_tune_op_list() - if tune_op_list: - self.result.add_tune_op_list(tune_op_list) + checker = self.checker + if not checker.pre_check(profiling_data): + return self.result + if checker.check(profiling_data): + # add record + record = checker.make_record(profiling_data) + checker.make_render(self.html_render, record) + self.result.add(record) + # add details + details = checker.get_details() + if details: + for i, detail in enumerate(details): + if i == 0: + # the first row is header + self.result.add_detail(checker.get_name(), headers=detail) + else: + self.result.add_detail(checker.get_name(), detail=detail) + # add tune op list + tune_op_list = checker.get_tune_op_list() + if tune_op_list: + self.result.add_tune_op_list(tune_op_list) return self.result @@ -69,3 +64,26 @@ class ProfilingAnalyzer(BaseAnalyzer, ABC): def make_render(self): pass + + +class DynamicShapeAnalyzer(ProfilingAnalyzer): + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = DynamicShapeChecker(self.cann_version) + + +class BlockDimAnalyzer(ProfilingAnalyzer): + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = BlockDimChecker(self.cann_version) + + +class OperatorBoundAnalyzer(ProfilingAnalyzer): + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = OperatorBoundChecker(self.cann_version) + +class AicpuAnalyzer(ProfilingAnalyzer): + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = AicpuChecker(self.cann_version) \ No newline at end of file diff --git a/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py b/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py index 713e1184299944ce506afbd48c4c2f1ec3f7d6e4..326be83b8d49088b1563ccd8c08b68a4aa3001ef 100644 --- a/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py +++ b/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py @@ -22,7 +22,7 @@ class FusionOPAnalyzer(BaseAnalyzer): self.html_render = HTMLRender() @BaseAnalyzer.check_data((GraphDataset.get_key(),)) - def optimize(self): + def optimize(self, **kwargs): """ :return: result """ diff --git a/profiler/advisor/analyzer/overall/overall_analyzer.py b/profiler/advisor/analyzer/overall/overall_analyzer.py index 7e5102bcfb6937691cfa4b2a962f7aa69c18d35f..916a396b3d096dc788954cbc8e8ba9755cd15f4e 100644 --- a/profiler/advisor/analyzer/overall/overall_analyzer.py +++ b/profiler/advisor/analyzer/overall/overall_analyzer.py @@ -18,7 +18,7 @@ class OverallSummaryAnalyzer(BaseAnalyzer): self.html_render = HTMLRender() self.result = OptimizeResult() - def optimize(self): + def optimize(self, **kwargs): compare_result = ComparisonInterface(self.benchmark_profiling_path, self.profiling_path).compare( Constant.OVERALL_COMPARE) diff --git a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py index e810994cd4d4bea76bc506b2121016009f380ecc..c74ae0510331fb9ba8a1794bd724710ba19cfabf 100644 --- a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py +++ b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py @@ -53,10 +53,9 @@ class OverallSummaryAnalyzer(BaseAnalyzer): "Free Time": ['SDMA Time(Num)'] } - def __init__(self, collection_path: str, n_processes: int = 1, cann_version=const.DEFAULT_CANN_VERSION, - torch_version=const.DEFAULT_TORCH_VERSION, **kwargs): + def __init__(self, collection_path: str, n_processes: int = 1, **kwargs): profile_path = get_profile_path(collection_path) - super().__init__(profile_path, n_processes, cann_version, torch_version, **kwargs) + super().__init__(profile_path, n_processes, **kwargs) self.base_collection_path = kwargs.get("base_collection_path", "") self._has_base_collection = False self._is_minimal_profiling = False @@ -179,7 +178,7 @@ class OverallSummaryAnalyzer(BaseAnalyzer): self.cur_bottleneck["overall_data"] = overall_bottleneck if comparison_bottleneck: self.cur_bottleneck["comparison_result"] = comparison_bottleneck - def optimize(self): + def optimize(self, **kwargs): if self.path_check(): self.process() self.identify_bottleneck() diff --git a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py index 01613dbe328d513048eb2e1539ac4a19f0c5d587..c1eb24b8e1e11ac167a7eb9333867167a57dd524 100644 --- a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py +++ b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py @@ -19,9 +19,8 @@ logger = logging.getLogger() class TimelineFusionOpsAnalyzer(BaseAnalyzer): dataset_cls_list = [TimelineEventDataset] - def __init__(self, collection_path, n_processes: int = 1, cann_version=const.DEFAULT_CANN_VERSION, - torch_version=const.DEFAULT_TORCH_VERSION, **kwargs): - super().__init__(collection_path, n_processes, cann_version, torch_version, **kwargs) + def __init__(self, collection_path, n_processes: int = 1, **kwargs): + super().__init__(collection_path, n_processes, **kwargs) self._matched_op_index = {} if self.n_processes <= 1 else multiprocessing.Manager().dict() self.matched_op_stacks = {} self.empty_stacks = True diff --git a/profiler/advisor/common/analyzer_scopes.py b/profiler/advisor/common/analyzer_scopes.py index 44f09d0a58b8ec5327846572af65f7161ee93e43..bfe49873a99ad1136b2c385457fb2644df992080 100644 --- a/profiler/advisor/common/analyzer_scopes.py +++ b/profiler/advisor/common/analyzer_scopes.py @@ -7,4 +7,7 @@ class SupportedScopes: SLOW_RANK = "slow_rank" SLOW_LINK = "slow_link" OVER_ALL = "over_all" - PROFILING_OPERATOR_ANALYSIS = "profiling_operator_analysis" + DYNAMIC_SHAPE_ANALYSIS = "dynamic_shape_analysis" + AICPU_ANALYSIS = "aicpu_analysis" + BLOCK_DIM_ANALYSIS = "block_dim_analysis" + OPERATOR_NO_BOUND_ANALYSIS = "operator_no_bound_analysis" diff --git a/profiler/advisor/computation_analysis.ipynb b/profiler/advisor/computation_analysis.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..bb9cb1112540a188a62c52a5a83a7c1d210b7571 --- /dev/null +++ b/profiler/advisor/computation_analysis.ipynb @@ -0,0 +1,770 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from prettytable import PrettyTable, ALL\n", + "from textwrap import fill\n", + "from profiler.advisor.interface.interface import Interface" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# 配置profiling采集出来的数据,需要指定到的profiling目录是同一个工具采集的,并且需要采集l0级别以上\n", + "profiling_path = r\"YOUR PROFILING PATH\"\n", + "interface = Interface(profiling_path=profiling_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Block Dim问题识别\n", + "\n", + "Block Dim问题主要为识别相关core算子AI core核未打满或者Vector 核未打满问题,主要调优手段为AOE调优,由于AOE调优依赖静态shape,所以当所有算子都为动态shape时,将不会检测相关Block Dim问题.\n", + "\n", + "下列代码为样例,主要展示如何检测Block Dim类型问题,并获取相关问题检测结果:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# 查询computation相关是否存在block dim问题\n", + "# 如果profiling数据采集自非8.0.0的CANN版本,需要在训练/推理环境中执行: 'cat CANN安装目录/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info'命令查看version\n", + "block_dim_result = interface.get_result(\"computation\", \"block_dim_analysis\", cann_version=\"7.0.RC1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
problem | \n", + "description | \n", + "suggestion | \n", + "problem count | \n", + "total_time(us) | \n", + "time ratio | \n", + "income(us) | \n", + "income ratio | \n", + "
---|---|---|---|---|---|---|---|
block dim | \n", + "some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg' | \n",
+ " 101 | \n", + "814.0199999999999 | \n", + "1.0 | \n", + "\n", + " | \n", + " |
op_name | \n", + "op_type | \n", + "task_type | \n", + "task_duration | \n", + "income | \n", + "block_dim | \n", + "mix_block_dim | \n", + "input_shapes | \n", + "input_data_types | \n", + "input_formats | \n", + "output_shapes | \n", + "output_data_types | \n", + "output_formats | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|
Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/attention_norm- LlamaRMSNorm/Square-op34Default/model- LlamaModel/layers- CellList/0-LLamaDecodeLayer/attention_norm- LlamaRMSNorm/ReduceMean-op35 | \n",
+ " Square | \n", + "AI_VECTOR_CORE | \n", + "42.76 | \n", + "0 | \n", + "16 | \n", + "0 | \n", + ""128,128" | \n", + "FLOAT | \n", + "NCHW | \n", + ""128,1" | \n", + "FLOAT | \n", + "NCHW | \n", + "
Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/ffn_norm- LlamaRMSNorm/Square-op77Default/model- LlamaModel/layers- CellList/0-LLamaDecodeLayer/ffn_norm- LlamaRMSNorm/ReduceMean-op78 | \n",
+ " Square | \n", + "AI_VECTOR_CORE | \n", + "42.24 | \n", + "0 | \n", + "16 | \n", + "0 | \n", + ""128,128" | \n", + "FLOAT | \n", + "NCHW | \n", + ""128,1" | \n", + "FLOAT | \n", + "NCHW | \n", + "
Default/lm_head-Linear/MatMul-op213 | \n", + "MatMulV2 | \n", + "AI_CORE | \n", + "39.02 | \n", + "0 | \n", + "20 | \n", + "0 | \n", + ""128,128;128,32000" | \n", + "FLOAT16;FLOAT16 | \n", + "FORMAT_ND;FORMAT_ND | \n", + ""128,32000" | \n", + "FLOAT | \n", + "FORMAT_ND | \n", + "
problem | \n", + "description | \n", + "suggestion | \n", + "problem count | \n", + "total_time(us) | \n", + "time ratio | \n", + "income(us) | \n", + "income ratio | \n", + "
---|---|---|---|---|---|---|---|
block dim | \n", + "some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg' | \n",
+ " 101 | \n", + "814.0199999999999 | \n", + "1.0 | \n", + "\n", + " | \n", + " |
operator no bound | \n", + "There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task duration operators need to be tuned are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg' | \n",
+ " 95 | \n", + "814.0199999999999 | \n", + "0.7985 | \n", + "\n", + " | \n", + " |
op_name | \n", + "op_type | \n", + "task_type | \n", + "task_duration | \n", + "vec_ratio | \n", + "mac_ratio | \n", + "scalar_ratio | \n", + "mte1_ratio | \n", + "mte2_ratio | \n", + "mte3_ratio | \n", + "block_dim | \n", + "input_shapes | \n", + "input_data_types | \n", + "input_formats | \n", + "output_shapes | \n", + "output_data_types | \n", + "output_formats | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/attention_norm- LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/attention_norm- LlamaRMSNorm/ReduceMean-op35 | \n",
+ " Square | \n", + "AI_VECTOR_CORE | \n", + "42.76 | \n", + "0.4654 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0.0056 | \n", + "16 | \n", + ""128,128" | \n", + "FLOAT | \n", + "NCHW | \n", + ""128,1" | \n", + "FLOAT | \n", + "NCHW | \n", + "
Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/Square- op77Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/ffn_norm- LlamaRMSNorm/ReduceMean-op78 | \n",
+ " Square | \n", + "AI_VECTOR_CORE | \n", + "42.24 | \n", + "0.466 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0.0062 | \n", + "16 | \n", + ""128,128" | \n", + "FLOAT | \n", + "NCHW | \n", + ""128,1" | \n", + "FLOAT | \n", + "NCHW | \n", + "
Default/lm_head-Linear/MatMul-op213 | \n", + "MatMulV2 | \n", + "AI_CORE | \n", + "39.02 | \n", + "0 | \n", + "0.1105 | \n", + "0.0119 | \n", + "0.0857 | \n", + "0.4284 | \n", + "0 | \n", + "20 | \n", + ""128,128;128,32000" | \n", + "FLOAT16;FLOAT16 | \n", + "FORMAT_ND;FORMAT_ND | \n", + ""128,32000" | \n", + "FLOAT | \n", + "FORMAT_ND | \n", + "
problem | \n", + "description | \n", + "suggestion | \n", + "problem count | \n", + "total_time(us) | \n", + "time ratio | \n", + "income(us) | \n", + "income ratio | \n", + "
---|---|---|---|---|---|---|---|
block dim | \n", + "some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg' | \n",
+ " 101 | \n", + "814.0199999999999 | \n", + "1.0 | \n", + "\n", + " | \n", + " |
operator no bound | \n", + "There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task duration operators need to be tuned are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg' | \n",
+ " 95 | \n", + "814.0199999999999 | \n", + "0.7985 | \n", + "\n", + " | \n", + " |
AICPU operator | \n", + "Some operators and task duration exceed 20 us, such as : Cast | \n",
+ " 1. Modify code to avoid aicpu operator | \n", + "39 | \n", + "686568.860000001 | \n", + "0.0189 | \n", + "\n", + " | \n", + " |
op_name | \n", + "op_type | \n", + "task_duration | \n", + "input_shapes | \n", + "input_data_types | \n", + "input_formats | \n", + "output_shapes | \n", + "output_data_types | \n", + "output_formats | \n", + "stack_info | \n", + "
---|---|---|---|---|---|---|---|---|---|
trans_Cast_5 | \n", + "Cast | \n", + "493.64 | \n", + """ | \n", + "INT32 | \n", + "FORMAT_ND | \n", + """ | \n", + "UINT64 | \n", + "FORMAT_ND | \n", + "/usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/functional.py(1279): dropout; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/dropout.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/l anguage_model.py(236): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/l anguage_model.py(425): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/g pt_model.py(84): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/m odule.py(184): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/d istributed.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; ../../pretrain_gpt.py(88): forward_step; /profiling_auto_GPT3/megatron/schedules.py(118): forward_step; /home/s30040711/Megatron-LM/megatro n_npu_adaptor/megatron_npu/adaptor_schedules.py(96 ): forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419): train_step; /profiling_auto_GPT3/megatron/training.py(837): train; /profiling_auto_GPT3/megatron/training.py(152): pretrain; ../../pretrain_gpt.py(122): <module> | \n",
+ "
trans_Cast_5 | \n", + "Cast | \n", + "413.4 | \n", + """ | \n", + "INT32 | \n", + "FORMAT_ND | \n", + """ | \n", + "UINT64 | \n", + "FORMAT_ND | \n", + "/usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/functional.py(1279): dropout; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/dropout.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/l anguage_model.py(236): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/l anguage_model.py(425): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/g pt_model.py(84): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/m odule.py(184): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/d istributed.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; ../../pretrain_gpt.py(88): forward_step; /profiling_auto_GPT3/megatron/schedules.py(118): forward_step; /home/s30040711/Megatron-LM/megatro n_npu_adaptor/megatron_npu/adaptor_schedules.py(10 9): forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419): train_step; /profiling_auto_GPT3/megatron/training.py(837): train; /profiling_auto_GPT3/megatron/training.py(152): pretrain; ../../pretrain_gpt.py(122): <module> | \n",
+ "