diff --git a/.gitignore b/.gitignore index a81c8ee121952cf06bfaf9ff9988edd8cded763c..c70c40e0f527c8c20a6bf994bcb8070b95e13e27 100644 --- a/.gitignore +++ b/.gitignore @@ -136,3 +136,10 @@ dmypy.json # Cython debug symbols cython_debug/ + +# vscode settings and analysis output +.vscode/ +att_advisor*.html +*.xlsx +operator_tuning_file*.cfg +.ipynb_checkpoints/ \ No newline at end of file diff --git a/profiler/advisor/analyzer/base_analyzer.py b/profiler/advisor/analyzer/base_analyzer.py index 7e0ce3b8968f4cfe64da3ad33eb09eb1e6e50ab0..5f4bd3202cd2071088f25564a7d4b14144a34826 100644 --- a/profiler/advisor/analyzer/base_analyzer.py +++ b/profiler/advisor/analyzer/base_analyzer.py @@ -17,11 +17,10 @@ class BaseAnalyzer(VersionControl, metaclass=ABCMeta): dataset_cls_list = [] - def __init__(self, collection_path, n_processes: int = 1, cann_version=constant.DEFAULT_CANN_VERSION, - torch_version=constant.DEFAULT_TORCH_VERSION, **kwargs): + def __init__(self, collection_path, n_processes: int = 1, **kwargs): self.n_processes = n_processes - self.cann_version = cann_version - self.torch_version = torch_version + self.cann_version = kwargs.get("cann_version", constant.DEFAULT_CANN_VERSION) + self.torch_version = kwargs.get("torch_version", constant.DEFAULT_TORCH_VERSION) self.html_render = HTMLRender() self.collection_path = collection_path self.kwargs = kwargs @@ -41,7 +40,7 @@ class BaseAnalyzer(VersionControl, metaclass=ABCMeta): def decorate(func): @wraps(func) - def wrapper(self): + def wrapper(self, **kwargs): data = self.dataset_list if data is None: return None @@ -57,7 +56,7 @@ class BaseAnalyzer(VersionControl, metaclass=ABCMeta): return decorate @abstractmethod - def optimize(self): + def optimize(self, **kwargs): pass @abstractmethod diff --git a/profiler/advisor/analyzer/cluster/slow_link_analyser.py b/profiler/advisor/analyzer/cluster/slow_link_analyser.py index a4c38187a792d6bde3ed5426b62b165c87627acf..846b79a50f31abb8445a0e5c2e82aaaf3c8ee23d 100644 --- a/profiler/advisor/analyzer/cluster/slow_link_analyser.py +++ b/profiler/advisor/analyzer/cluster/slow_link_analyser.py @@ -37,9 +37,8 @@ class SlowLinkAnalyzer(BaseAnalyzer): SLOW_LINK_ANALYSIS = "slow_link_analysis" dataset_cls_list = [ClusterCommunicationDataSet] - def __init__(self, collection_path, n_processes: int = 1, cann_version=constant.DEFAULT_CANN_VERSION, - torch_version=constant.DEFAULT_TORCH_VERSION, **kwargs): - super().__init__(collection_path, n_processes, cann_version, torch_version, **kwargs) + def __init__(self, collection_path, n_processes: int = 1, **kwargs): + super().__init__(collection_path, n_processes, **kwargs) key = ClusterCommunicationDataSet.get_key() self.communication_data_class = self.get_first_data_by_key(self.dataset_list, key) self.rank_bw_dict = self.communication_data_class.get_data() diff --git a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py index ff80178e239feed79809c72e306356b9e305017e..4215b514a215a2a350571746ff9cb90c3c9956eb 100644 --- a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py +++ b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py @@ -29,9 +29,8 @@ class SlowRankAnalyzer(BaseAnalyzer): BOTTLENECK_LIST = ['Computing', 'Communication', "Free"] dataset_cls_list = [ClusterStepTraceTimeDataSet] - def __init__(self, collection_path, n_processes: int = 1, cann_version=constant.DEFAULT_CANN_VERSION, - torch_version=constant.DEFAULT_TORCH_VERSION, **kwargs): - super().__init__(collection_path, n_processes, cann_version, torch_version, **kwargs) + def __init__(self, collection_path, n_processes: int = 1, **kwargs): + super().__init__(collection_path, n_processes, **kwargs) key = ClusterStepTraceTimeDataSet.get_key() self.step_trace_class = self.get_first_data_by_key(self.dataset_list, key) self.step_trace_dict = self.step_trace_class.get_data() diff --git a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py index d90ef56c78b52987bc7d1dbb2c17d5caf9eb7706..a7d7ddd93c70e59dc0d10318fdac06fdc581f70c 100644 --- a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py +++ b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py @@ -20,7 +20,11 @@ class BlockDimChecker(OperatorChecker): "input_data_types", "input_formats", "output_shapes", "output_data_types", "output_formats" ] + def pre_check(self, profiling_data) -> bool: + return not self.is_dynamic_shape(profiling_data) + def _check_data(self, data): + self.format_suggestion_content(data) if not self._check_summary(data): return False if not Config().get_config("ai_core_num"): @@ -69,9 +73,3 @@ class BlockDimChecker(OperatorChecker): else: core_num = self._aiv_num return core_num - - def format_suggestion_content(self, profiling_data: ProfilingDataset) -> None: - if profiling_data.PROF_TYPE == constant.ASCEND_PYTORCH_PROFILER: - self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION) - elif profiling_data.PROF_TYPE == constant.MSLITE: - self._SUGGESTION.append(self.MSLite_OPERATOR_TUNE_SUGGESTION) diff --git a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py index 4ede3c94e6703f2ce38f9339db8fe9405fcfa82f..a22b380f974b14207d6d7be262cd49f0ba0fbe99 100644 --- a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py +++ b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py @@ -24,12 +24,15 @@ class OperatorBoundChecker(OperatorChecker): "output_data_types", "output_formats" ] + def pre_check(self, profiling_data) -> bool: + return not self.is_dynamic_shape(profiling_data) + def _check_data(self, data): + self.format_suggestion_content(data) if not self._check_summary(data): return False for op_info in data.op_summary.op_list: - if self._check_operator(op_info): - return True + return self._check_operator(op_info) logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "ratio in op summary") return False @@ -48,9 +51,3 @@ class OperatorBoundChecker(OperatorChecker): template_dir="templates", template_name="operator_no_bound.html", format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK)) - - def format_suggestion_content(self, profiling_data: ProfilingDataset) -> None: - if profiling_data.PROF_TYPE == constant.ASCEND_PYTORCH_PROFILER: - self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION) - elif profiling_data.PROF_TYPE == constant.MSLITE: - self._SUGGESTION.append(self.MSLite_OPERATOR_TUNE_SUGGESTION) diff --git a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py index 070b3a3b57b0a3d850a2e34bf408c5cf6c2a9610..86d3bac4ff8cb163d23a6365307b855839b12a6a 100644 --- a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py +++ b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py @@ -25,33 +25,7 @@ class DynamicShapeChecker(OperatorChecker): super().__init__(cann_version=cann_version) def check(self, profiling_database) -> bool: - less_than_cann800_list = [constant.CANN_VERSION_C30, constant.CANN_VERSION_C13, constant.CANN_VERSION_C15] - # CANN 8.0.0 之前从 ge_info 中获取 op_state 属性,进行动态 shape 逻辑判断 - if self.cann_version in less_than_cann800_list: - if hasattr(profiling_database, "ge_info"): - ge_info = profiling_database.ge_info - static_shape_operators = ge_info.get_static_shape_operators() - if len(static_shape_operators) == 0: - OperatorChecker.IS_ALL_OPERATOR_DYNAMIC_SHAPE = True - return True - else: - logger.warning( - "Skip dynamic shape checker because of not containing ge_info.db file in host filefloder.\n" - "To enable dynamic shape checker, please try to set data_simplification=False in experimental_config.\n" - "More details please refer to link : %s", constant.ASCEND_PROFILER_URL) - else: - # CANN 8.0.0 之后 op_state 属性从 op_summary 文件中获取 - if hasattr(profiling_database, "op_summary"): - static_shape_operators = profiling_database.op_summary.get_static_shape_operators() - if len(static_shape_operators) == 0: - OperatorChecker.IS_ALL_OPERATOR_DYNAMIC_SHAPE = True - return True - else: - logger.warning( - "Skip dynamic shape checker because of not containing op_summary.csv file in current filefloder." - ) - - return False + return self.is_dynamic_shape(profiling_database) def make_record(self, profiling_database) -> OptimizeRecord: """ diff --git a/profiler/advisor/analyzer/computation/operator_checker.py b/profiler/advisor/analyzer/computation/operator_checker.py index 6bb837004b4282e406d8e1c7b3c5c2a135b9be0c..0f47650943a7355b494bd766214d10526c46c0fa 100644 --- a/profiler/advisor/analyzer/computation/operator_checker.py +++ b/profiler/advisor/analyzer/computation/operator_checker.py @@ -15,8 +15,7 @@ logger = logging.getLogger() class OperatorChecker(VersionControl): - _SUPPORT_VERSIONS = [constant.CANN_VERSION_C30, constant.CANN_VERSION_C13, constant.CANN_VERSION_C15, constant.CANN_VERSION_C17] - IS_ALL_OPERATOR_DYNAMIC_SHAPE = False + _SUPPORT_VERSIONS = constant.SUPPORTED_CANN_VERSION _MAX_TUNE_OP_NUM = constant.OPERATOR_OUT_TOPK _MIN_TASK_DURATION = 0 _MIN_TASK_DURATION_RATIO = 1.0 @@ -115,10 +114,33 @@ class OperatorChecker(VersionControl): return description def pre_check(self, profiling_data) -> bool: - self.format_suggestion_content(profiling_data) - return not (OperatorChecker.IS_ALL_OPERATOR_DYNAMIC_SHAPE and ( - OperatorChecker.PyTorch_OPERATOR_TUNE_SUGGESTION or OperatorChecker.MSLite_OPERATOR_TUNE_SUGGESTION - ) in self._SUGGESTION) + return True + + def is_dynamic_shape(self, profiling_database: ProfilingDataset) -> bool: + less_than_cann800_list = [constant.CANN_VERSION_C30, constant.CANN_VERSION_C13, constant.CANN_VERSION_C15] + # CANN 8.0.0 之前从 ge_info 中获取 op_state 属性,进行动态 shape 逻辑判断 + if self.cann_version in less_than_cann800_list: + if hasattr(profiling_database, "ge_info"): + ge_info = profiling_database.ge_info + static_shape_operators = ge_info.get_static_shape_operators() + if len(static_shape_operators) == 0: + return True + else: + logger.warning( + "Skip dynamic shape check because of not containing ge_info.db file in host filefloder.\n" + "To enable dynamic shape check, please try to set data_simplification=False in experimental_config.\n" + "More details please refer to link : %s", constant.ASCEND_PROFILER_URL) + else: + # CANN 8.0.0 之后 op_state 属性从 op_summary 文件中获取 + if hasattr(profiling_database, "op_summary"): + static_shape_operators = profiling_database.op_summary.get_static_shape_operators() + if len(static_shape_operators) == 0: + return True + else: + logger.warning( + "Skip dynamic shape check because of not containing op_summary.csv file in current filefloder." + ) + return False def format_operator_result(self, record, limit): """ @@ -279,4 +301,7 @@ class OperatorChecker(VersionControl): return details def format_suggestion_content(self, profiling_data: ProfilingDataset) -> None: - return + if profiling_data.PROF_TYPE == constant.ASCEND_PYTORCH_PROFILER: + self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION) + elif profiling_data.PROF_TYPE == constant.MSLITE: + self._SUGGESTION.append(self.MSLite_OPERATOR_TUNE_SUGGESTION) diff --git a/profiler/advisor/analyzer/computation/profiling_analyzer.py b/profiler/advisor/analyzer/computation/profiling_analyzer.py index 98d3c5c49b74362137126ec1276c3684284662f0..8682617700702055628a31982b0eafab9feb336d 100644 --- a/profiler/advisor/analyzer/computation/profiling_analyzer.py +++ b/profiler/advisor/analyzer/computation/profiling_analyzer.py @@ -22,45 +22,40 @@ class ProfilingAnalyzer(BaseAnalyzer, ABC): dataset_cls_list = [ProfilingDataset] def __init__(self, collection_path, **kwargs) -> None: - cann_version = kwargs.get("cann_version", constant.DEFAULT_CANN_VERSION) - torch_version = kwargs.get("torch_version", constant.DEFAULT_TORCH_VERSION) - super().__init__(collection_path, cann_version=cann_version, torch_version=torch_version, **kwargs) - self.checker_list = [checker(cann_version) for checker in get_supported_subclass(OperatorChecker, cann_version)] - # 动态 shape checker 放到首位,因为动态 shape 情形下AOE算子调优现在不支持,AOE 算子调优 checker 可以跳过 - index = next((i for i, item in enumerate(self.checker_list) if isinstance(item, DynamicShapeChecker)), None) - self.checker_list.insert(0, self.checker_list.pop(index)) + super().__init__(collection_path, **kwargs) + self.checker = OperatorChecker(self.cann_version) self.html_render = HTMLRender() self.result = OptimizeResult() @BaseAnalyzer.check_data((ProfilingDataset.get_key(),)) - def optimize(self) -> OptimizeResult: + def optimize(self, **kwargs) -> OptimizeResult: """ optimize operator :param data: input datasets :return: result """ profiling_data = self.get_first_data_by_key(self.dataset_list, ProfilingDataset.get_key()) - for checker in self.checker_list: - if not checker.pre_check(profiling_data): - continue - if checker.check(profiling_data): - # add record - record = checker.make_record(profiling_data) - checker.make_render(self.html_render, record) - self.result.add(record) - # add details - details = checker.get_details() - if details: - for i, detail in enumerate(details): - if i == 0: - # the first row is header - self.result.add_detail(checker.get_name(), headers=detail) - else: - self.result.add_detail(checker.get_name(), detail=detail) - # add tune op list - tune_op_list = checker.get_tune_op_list() - if tune_op_list: - self.result.add_tune_op_list(tune_op_list) + checker = self.checker + if not checker.pre_check(profiling_data): + return self.result + if checker.check(profiling_data): + # add record + record = checker.make_record(profiling_data) + checker.make_render(self.html_render, record) + self.result.add(record) + # add details + details = checker.get_details() + if details: + for i, detail in enumerate(details): + if i == 0: + # the first row is header + self.result.add_detail(checker.get_name(), headers=detail) + else: + self.result.add_detail(checker.get_name(), detail=detail) + # add tune op list + tune_op_list = checker.get_tune_op_list() + if tune_op_list: + self.result.add_tune_op_list(tune_op_list) return self.result @@ -69,3 +64,26 @@ class ProfilingAnalyzer(BaseAnalyzer, ABC): def make_render(self): pass + + +class DynamicShapeAnalyzer(ProfilingAnalyzer): + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = DynamicShapeChecker(self.cann_version) + + +class BlockDimAnalyzer(ProfilingAnalyzer): + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = BlockDimChecker(self.cann_version) + + +class OperatorBoundAnalyzer(ProfilingAnalyzer): + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = OperatorBoundChecker(self.cann_version) + +class AicpuAnalyzer(ProfilingAnalyzer): + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = AicpuChecker(self.cann_version) \ No newline at end of file diff --git a/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py b/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py index 713e1184299944ce506afbd48c4c2f1ec3f7d6e4..326be83b8d49088b1563ccd8c08b68a4aa3001ef 100644 --- a/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py +++ b/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py @@ -22,7 +22,7 @@ class FusionOPAnalyzer(BaseAnalyzer): self.html_render = HTMLRender() @BaseAnalyzer.check_data((GraphDataset.get_key(),)) - def optimize(self): + def optimize(self, **kwargs): """ :return: result """ diff --git a/profiler/advisor/analyzer/overall/overall_analyzer.py b/profiler/advisor/analyzer/overall/overall_analyzer.py index 7e5102bcfb6937691cfa4b2a962f7aa69c18d35f..916a396b3d096dc788954cbc8e8ba9755cd15f4e 100644 --- a/profiler/advisor/analyzer/overall/overall_analyzer.py +++ b/profiler/advisor/analyzer/overall/overall_analyzer.py @@ -18,7 +18,7 @@ class OverallSummaryAnalyzer(BaseAnalyzer): self.html_render = HTMLRender() self.result = OptimizeResult() - def optimize(self): + def optimize(self, **kwargs): compare_result = ComparisonInterface(self.benchmark_profiling_path, self.profiling_path).compare( Constant.OVERALL_COMPARE) diff --git a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py index e810994cd4d4bea76bc506b2121016009f380ecc..c74ae0510331fb9ba8a1794bd724710ba19cfabf 100644 --- a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py +++ b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py @@ -53,10 +53,9 @@ class OverallSummaryAnalyzer(BaseAnalyzer): "Free Time": ['SDMA Time(Num)'] } - def __init__(self, collection_path: str, n_processes: int = 1, cann_version=const.DEFAULT_CANN_VERSION, - torch_version=const.DEFAULT_TORCH_VERSION, **kwargs): + def __init__(self, collection_path: str, n_processes: int = 1, **kwargs): profile_path = get_profile_path(collection_path) - super().__init__(profile_path, n_processes, cann_version, torch_version, **kwargs) + super().__init__(profile_path, n_processes, **kwargs) self.base_collection_path = kwargs.get("base_collection_path", "") self._has_base_collection = False self._is_minimal_profiling = False @@ -179,7 +178,7 @@ class OverallSummaryAnalyzer(BaseAnalyzer): self.cur_bottleneck["overall_data"] = overall_bottleneck if comparison_bottleneck: self.cur_bottleneck["comparison_result"] = comparison_bottleneck - def optimize(self): + def optimize(self, **kwargs): if self.path_check(): self.process() self.identify_bottleneck() diff --git a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py index 01613dbe328d513048eb2e1539ac4a19f0c5d587..c1eb24b8e1e11ac167a7eb9333867167a57dd524 100644 --- a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py +++ b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py @@ -19,9 +19,8 @@ logger = logging.getLogger() class TimelineFusionOpsAnalyzer(BaseAnalyzer): dataset_cls_list = [TimelineEventDataset] - def __init__(self, collection_path, n_processes: int = 1, cann_version=const.DEFAULT_CANN_VERSION, - torch_version=const.DEFAULT_TORCH_VERSION, **kwargs): - super().__init__(collection_path, n_processes, cann_version, torch_version, **kwargs) + def __init__(self, collection_path, n_processes: int = 1, **kwargs): + super().__init__(collection_path, n_processes, **kwargs) self._matched_op_index = {} if self.n_processes <= 1 else multiprocessing.Manager().dict() self.matched_op_stacks = {} self.empty_stacks = True diff --git a/profiler/advisor/common/analyzer_scopes.py b/profiler/advisor/common/analyzer_scopes.py index 44f09d0a58b8ec5327846572af65f7161ee93e43..bfe49873a99ad1136b2c385457fb2644df992080 100644 --- a/profiler/advisor/common/analyzer_scopes.py +++ b/profiler/advisor/common/analyzer_scopes.py @@ -7,4 +7,7 @@ class SupportedScopes: SLOW_RANK = "slow_rank" SLOW_LINK = "slow_link" OVER_ALL = "over_all" - PROFILING_OPERATOR_ANALYSIS = "profiling_operator_analysis" + DYNAMIC_SHAPE_ANALYSIS = "dynamic_shape_analysis" + AICPU_ANALYSIS = "aicpu_analysis" + BLOCK_DIM_ANALYSIS = "block_dim_analysis" + OPERATOR_NO_BOUND_ANALYSIS = "operator_no_bound_analysis" diff --git a/profiler/advisor/computation_analysis.ipynb b/profiler/advisor/computation_analysis.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..bb9cb1112540a188a62c52a5a83a7c1d210b7571 --- /dev/null +++ b/profiler/advisor/computation_analysis.ipynb @@ -0,0 +1,770 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from prettytable import PrettyTable, ALL\n", + "from textwrap import fill\n", + "from profiler.advisor.interface.interface import Interface" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# 配置profiling采集出来的数据,需要指定到的profiling目录是同一个工具采集的,并且需要采集l0级别以上\n", + "profiling_path = r\"YOUR PROFILING PATH\"\n", + "interface = Interface(profiling_path=profiling_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Block Dim问题识别\n", + "\n", + "Block Dim问题主要为识别相关core算子AI core核未打满或者Vector 核未打满问题,主要调优手段为AOE调优,由于AOE调优依赖静态shape,所以当所有算子都为动态shape时,将不会检测相关Block Dim问题.\n", + "\n", + "下列代码为样例,主要展示如何检测Block Dim类型问题,并获取相关问题检测结果:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# 查询computation相关是否存在block dim问题\n", + "# 如果profiling数据采集自非8.0.0的CANN版本,需要在训练/推理环境中执行: 'cat CANN安装目录/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info'命令查看version\n", + "block_dim_result = interface.get_result(\"computation\", \"block_dim_analysis\", cann_version=\"7.0.RC1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
problemdescriptionsuggestionproblem counttotal_time(us)time ratioincome(us)income ratio
block dimsome operator does not make full use of 25 ai core or 50 ai vector core;
Top-10 operator of task duration are as follows:
Square, MatMulV2, BatchMatMul,
SoftmaxV2, Mul, Transpose,
Assign, GatherV2, Sigmoid,
Cast
1. Optimize operator by AOE, such as:
'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg'
101814.01999999999991.0
" + ], + "text/plain": [ + "+-----------+--------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| problem | description | suggestion | problem count | total_time(us) | time ratio | income(us) | income ratio |\n", + "+-----------+--------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| block dim | some operator does not make full use of 25 ai core or 50 ai vector core; | 1. Optimize operator by AOE, such as: | 101 | 814.0199999999999 | 1.0 | | |\n", + "| | Top-10 operator of task duration are as follows: | 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg' | | | | | |\n", + "| | Square, MatMulV2, BatchMatMul, | | | | | | |\n", + "| | SoftmaxV2, Mul, Transpose, | | | | | | |\n", + "| | Assign, GatherV2, Sigmoid, | | | | | | |\n", + "| | Cast | | | | | | |\n", + "+-----------+--------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "problems = block_dim_result.get(\"problems\")\n", + "if problems: # 如果存在相关问题则获取相关问题检测描述及建议\n", + " problem_table = PrettyTable(problems.get(\"headers\"))\n", + " for row in problems.get(\"data\"):\n", + " problem_table.add_row(row)\n", + " \n", + " problem_table.align = \"l\"\n", + " display(problem_table)\n", + "else:\n", + " print(\"There is no suggestion related to block dim.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
op_nameop_typetask_typetask_durationincomeblock_dimmix_block_diminput_shapesinput_data_typesinput_formatsoutput_shapesoutput_data_typesoutput_formats
Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/attention_norm-
LlamaRMSNorm/Square-op34Default/model-
LlamaModel/layers-
CellList/0-LLamaDecodeLayer/attention_norm-
LlamaRMSNorm/ReduceMean-op35
SquareAI_VECTOR_CORE42.760160"128,128"FLOATNCHW"128,1"FLOATNCHW
Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/ffn_norm-
LlamaRMSNorm/Square-op77Default/model-
LlamaModel/layers-
CellList/0-LLamaDecodeLayer/ffn_norm-
LlamaRMSNorm/ReduceMean-op78
SquareAI_VECTOR_CORE42.240160"128,128"FLOATNCHW"128,1"FLOATNCHW
Default/lm_head-Linear/MatMul-op213MatMulV2AI_CORE39.020200"128,128;128,32000"FLOAT16;FLOAT16FORMAT_ND;FORMAT_ND"128,32000"FLOATFORMAT_ND
" + ], + "text/plain": [ + "+---------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| op_name | op_type | task_type | task_duration | income | block_dim | mix_block_dim | input_shapes | input_data_types | input_formats | output_shapes | output_data_types | output_formats |\n", + "+---------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/model-LlamaModel/layers- | Square | AI_VECTOR_CORE | 42.76 | 0 | 16 | 0 | \"128,128\" | FLOAT | NCHW | \"128,1\" | FLOAT | NCHW |\n", + "| CellList/0-LLamaDecodeLayer/attention_norm- | | | | | | | | | | | | |\n", + "| LlamaRMSNorm/Square-op34Default/model- | | | | | | | | | | | | |\n", + "| LlamaModel/layers- | | | | | | | | | | | | |\n", + "| CellList/0-LLamaDecodeLayer/attention_norm- | | | | | | | | | | | | |\n", + "| LlamaRMSNorm/ReduceMean-op35 | | | | | | | | | | | | |\n", + "+---------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/model-LlamaModel/layers- | Square | AI_VECTOR_CORE | 42.24 | 0 | 16 | 0 | \"128,128\" | FLOAT | NCHW | \"128,1\" | FLOAT | NCHW |\n", + "| CellList/0-LLamaDecodeLayer/ffn_norm- | | | | | | | | | | | | |\n", + "| LlamaRMSNorm/Square-op77Default/model- | | | | | | | | | | | | |\n", + "| LlamaModel/layers- | | | | | | | | | | | | |\n", + "| CellList/0-LLamaDecodeLayer/ffn_norm- | | | | | | | | | | | | |\n", + "| LlamaRMSNorm/ReduceMean-op78 | | | | | | | | | | | | |\n", + "+---------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/lm_head-Linear/MatMul-op213 | MatMulV2 | AI_CORE | 39.02 | 0 | 20 | 0 | \"128,128;128,32000\" | FLOAT16;FLOAT16 | FORMAT_ND;FORMAT_ND | \"128,32000\" | FLOAT | FORMAT_ND |\n", + "+---------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "if problems: # 如果存在相关问题则获取相关问题检测细节\n", + " block_dim = block_dim_result.get(\"block dim\")\n", + " block_dim_table = PrettyTable(block_dim.get(\"headers\"))\n", + " for row in block_dim.get(\"data\"):\n", + " for i in range(len(row)):\n", + " row[i] = fill(str(row[i]), width=50)\n", + " block_dim_table.add_row(row)\n", + "\n", + " block_dim_table.hrules = ALL\n", + " display(block_dim_table[:3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Operator No Bound问题识别\n", + "Operator No Bound问题主要为识别相关算子无mte, cube, vector, scalar相关bound问题,主要调优手段为AOE调优,由于AOE调优依赖静态shape,所以当所有算子都为动态shape时,将不会检测相关Operator No Bound问题.\n", + "\n", + "下列代码为样例,主要展示如何检测Operator No Bound类型问题,并获取相关问题检测结果:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from prettytable import PrettyTable, ALL\n", + "from textwrap import fill\n", + "from profiler.advisor.interface.interface import Interface\n", + "\n", + "\n", + "# 配置profiling采集出来的数据,需要指定到的profiling目录是同一个工具采集的,并且需要采集l0级别以上\n", + "profiling_path = r\"YOUR PROFILING PATH\"\n", + "interface = Interface(profiling_path=profiling_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# 查询computation相关是否存在operator no bound问题\n", + "# 如果profiling数据采集自非8.0.0的CANN版本,需要在训练/推理环境中执行: 'cat CANN安装目录/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info'命令查看version\n", + "operator_no_bound_result = interface.get_result(\"computation\", \"operator_no_bound_analysis\", cann_version=\"7.0.RC1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
problemdescriptionsuggestionproblem counttotal_time(us)time ratioincome(us)income ratio
block dimsome operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as
follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path
--tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg'
101814.01999999999991.0
operator no boundThere is no mte, cube, vector, scalar ratio is more than 80.00%; Top task duration operators need to be tuned are as
follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path
--tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg'
95814.01999999999990.7985
" + ], + "text/plain": [ + "+-------------------+----------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| problem | description | suggestion | problem count | total_time(us) | time ratio | income(us) | income ratio |\n", + "+-------------------+----------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| block dim | some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as | 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path | 101 | 814.0199999999999 | 1.0 | | |\n", + "| | follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg' | | | | | |\n", + "| operator no bound | There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task duration operators need to be tuned are as | 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path | 95 | 814.0199999999999 | 0.7985 | | |\n", + "| | follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg' | | | | | |\n", + "+-------------------+----------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "problems = operator_no_bound_result.get(\"problems\")\n", + "problem_table = PrettyTable(problems.get(\"headers\"))\n", + "if problems: # 如果存在相关问题则获取相关问题检测描述及建议\n", + " for row in problems.get(\"data\"):\n", + " for i in range(len(row)):\n", + " row[i] = fill(str(row[i]), width=120)\n", + " problem_table.add_row(row)\n", + "\n", + " problem_table.align = \"l\"\n", + " display(problem_table)\n", + "else:\n", + " print(\"There is no suggestion related to operator no bound.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
op_nameop_typetask_typetask_durationvec_ratiomac_ratioscalar_ratiomte1_ratiomte2_ratiomte3_ratioblock_diminput_shapesinput_data_typesinput_formatsoutput_shapesoutput_data_typesoutput_formats
Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/attention_norm-
LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/attention_norm-
LlamaRMSNorm/ReduceMean-op35
SquareAI_VECTOR_CORE42.760.465400000.005616"128,128"FLOATNCHW"128,1"FLOATNCHW
Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/Square-
op77Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/ffn_norm-
LlamaRMSNorm/ReduceMean-op78
SquareAI_VECTOR_CORE42.240.46600000.006216"128,128"FLOATNCHW"128,1"FLOATNCHW
Default/lm_head-Linear/MatMul-op213MatMulV2AI_CORE39.0200.11050.01190.08570.4284020"128,128;128,32000"FLOAT16;FLOAT16FORMAT_ND;FORMAT_ND"128,32000"FLOATFORMAT_ND
" + ], + "text/plain": [ + "+-----------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| op_name | op_type | task_type | task_duration | vec_ratio | mac_ratio | scalar_ratio | mte1_ratio | mte2_ratio | mte3_ratio | block_dim | input_shapes | input_data_types | input_formats | output_shapes | output_data_types | output_formats |\n", + "+-----------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/model-LlamaModel/layers- | Square | AI_VECTOR_CORE | 42.76 | 0.4654 | 0 | 0 | 0 | 0 | 0.0056 | 16 | \"128,128\" | FLOAT | NCHW | \"128,1\" | FLOAT | NCHW |\n", + "| CellList/0-LLamaDecodeLayer/attention_norm- | | | | | | | | | | | | | | | | |\n", + "| LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers- | | | | | | | | | | | | | | | | |\n", + "| CellList/0-LLamaDecodeLayer/attention_norm- | | | | | | | | | | | | | | | | |\n", + "| LlamaRMSNorm/ReduceMean-op35 | | | | | | | | | | | | | | | | |\n", + "+-----------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/model-LlamaModel/layers- | Square | AI_VECTOR_CORE | 42.24 | 0.466 | 0 | 0 | 0 | 0 | 0.0062 | 16 | \"128,128\" | FLOAT | NCHW | \"128,1\" | FLOAT | NCHW |\n", + "| CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/Square- | | | | | | | | | | | | | | | | |\n", + "| op77Default/model-LlamaModel/layers- | | | | | | | | | | | | | | | | |\n", + "| CellList/0-LLamaDecodeLayer/ffn_norm- | | | | | | | | | | | | | | | | |\n", + "| LlamaRMSNorm/ReduceMean-op78 | | | | | | | | | | | | | | | | |\n", + "+-----------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/lm_head-Linear/MatMul-op213 | MatMulV2 | AI_CORE | 39.02 | 0 | 0.1105 | 0.0119 | 0.0857 | 0.4284 | 0 | 20 | \"128,128;128,32000\" | FLOAT16;FLOAT16 | FORMAT_ND;FORMAT_ND | \"128,32000\" | FLOAT | FORMAT_ND |\n", + "+-----------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "if problems: # 如果存在相关问题则获取相关问题检测细节\n", + " operator_no_bound = operator_no_bound_result.get(\"operator no bound\")\n", + " operator_no_bound_table = PrettyTable(operator_no_bound.get(\"headers\"))\n", + " for row in operator_no_bound.get(\"data\"):\n", + " for i in range(len(row)):\n", + " row[i] = fill(str(row[i]), width=60)\n", + " operator_no_bound_table.add_row(row)\n", + " operator_no_bound_table.hrules = ALL\n", + " display(operator_no_bound_table[:3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### AICPU问题识别\n", + "AICPU问题主要为识别相关算子执行时跑到AICPU上计算,并没有利用到AI CORE的计算能力的场景,主要调优手段为修改相关代码来避免AICPU算子,可参见相关资料,来避免AICPU算子的问题:\n", + "https://support.huaweicloud.com/bestpractice-modelarts/modelarts_10_2517.html\n", + "\n", + "下列代码为样例,主要展示如何检测Dynamic Shape类型问题,并获取相关问题检测结果:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from prettytable import PrettyTable, ALL\n", + "from textwrap import fill\n", + "from profiler.advisor.interface.interface import Interface\n", + "\n", + "\n", + "# 配置profiling采集出来的数据,需要指定到的profiling目录是同一个工具采集的,并且需要采集l0级别以上\n", + "profiling_path = r\"YOUR PROFILING PATH\"\n", + "interface = Interface(profiling_path=profiling_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Please ensure only one trace_view.json in C:\\personalC\\profiling_data, there will analyze first timeline profiling data.\n", + " \r" + ] + } + ], + "source": [ + "# 查询computation相关是否存在aicpu问题\n", + "# 如果profiling数据采集自非8.0.0的CANN版本,需要在训练/推理环境中执行: 'cat CANN安装目录/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info'命令查看version\n", + "aicpu_result = interface.get_result(\"computation\", \"aicpu_analysis\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
problemdescriptionsuggestionproblem counttotal_time(us)time ratioincome(us)income ratio
block dimsome operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as
follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path
--tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg'
101814.01999999999991.0
operator no boundThere is no mte, cube, vector, scalar ratio is more than 80.00%; Top task duration operators need to be tuned are as
follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path
--tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg'
95814.01999999999990.7985
AICPU operatorSome operators and task duration exceed 20 us, such as :
Cast
1. Modify code to avoid aicpu operator39686568.8600000010.0189
" + ], + "text/plain": [ + "+-------------------+----------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| problem | description | suggestion | problem count | total_time(us) | time ratio | income(us) | income ratio |\n", + "+-------------------+----------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| block dim | some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as | 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path | 101 | 814.0199999999999 | 1.0 | | |\n", + "| | follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg' | | | | | |\n", + "| operator no bound | There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task duration operators need to be tuned are as | 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path | 95 | 814.0199999999999 | 0.7985 | | |\n", + "| | follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advisor\\operator_tuning_file_20240607114012.cfg' | | | | | |\n", + "| AICPU operator | Some operators and task duration exceed 20 us, such as : | 1. Modify code to avoid aicpu operator | 39 | 686568.860000001 | 0.0189 | | |\n", + "| | Cast | | | | | | |\n", + "+-------------------+----------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "problems = aicpu_result.get(\"problems\")\n", + "if problems: # 如果存在相关问题则获取相关问题检测描述及建议\n", + " problem_table = PrettyTable(problems.get(\"headers\"))\n", + " for row in problems.get(\"data\"):\n", + " problem_table.add_row(row)\n", + "\n", + " problem_table.align = \"l\"\n", + " display(problem_table)\n", + "else:\n", + " print(\"There is no suggestion related to operator no bound.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
op_nameop_typetask_durationinput_shapesinput_data_typesinput_formatsoutput_shapesoutput_data_typesoutput_formatsstack_info
trans_Cast_5Cast493.64""INT32FORMAT_ND""UINT64FORMAT_ND/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/functional.py(1279): dropout;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/dropout.py(58): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110):
_call_impl; /profiling_auto_GPT3/megatron/model/l
anguage_model.py(236): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110):
_call_impl; /profiling_auto_GPT3/megatron/model/l
anguage_model.py(425): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110):
_call_impl; /profiling_auto_GPT3/megatron/model/g
pt_model.py(84): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110):
_call_impl; /profiling_auto_GPT3/megatron/model/m
odule.py(184): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110):
_call_impl; /profiling_auto_GPT3/megatron/model/d
istributed.py(58): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110):
_call_impl; ../../pretrain_gpt.py(88):
forward_step;
/profiling_auto_GPT3/megatron/schedules.py(118):
forward_step; /home/s30040711/Megatron-LM/megatro
n_npu_adaptor/megatron_npu/adaptor_schedules.py(96
): forward_backward_no_pipelining;
/profiling_auto_GPT3/megatron/training.py(419):
train_step;
/profiling_auto_GPT3/megatron/training.py(837):
train;
/profiling_auto_GPT3/megatron/training.py(152):
pretrain; ../../pretrain_gpt.py(122): <module>
trans_Cast_5Cast413.4""INT32FORMAT_ND""UINT64FORMAT_ND/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/functional.py(1279): dropout;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/dropout.py(58): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110):
_call_impl; /profiling_auto_GPT3/megatron/model/l
anguage_model.py(236): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110):
_call_impl; /profiling_auto_GPT3/megatron/model/l
anguage_model.py(425): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110):
_call_impl; /profiling_auto_GPT3/megatron/model/g
pt_model.py(84): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110):
_call_impl; /profiling_auto_GPT3/megatron/model/m
odule.py(184): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110):
_call_impl; /profiling_auto_GPT3/megatron/model/d
istributed.py(58): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110):
_call_impl; ../../pretrain_gpt.py(88):
forward_step;
/profiling_auto_GPT3/megatron/schedules.py(118):
forward_step; /home/s30040711/Megatron-LM/megatro
n_npu_adaptor/megatron_npu/adaptor_schedules.py(10
9): forward_backward_no_pipelining;
/profiling_auto_GPT3/megatron/training.py(419):
train_step;
/profiling_auto_GPT3/megatron/training.py(837):
train;
/profiling_auto_GPT3/megatron/training.py(152):
pretrain; ../../pretrain_gpt.py(122): <module>
" + ], + "text/plain": [ + "+--------------+---------+---------------+--------------+------------------+---------------+---------------+-------------------+----------------+----------------------------------------------------+\n", + "| op_name | op_type | task_duration | input_shapes | input_data_types | input_formats | output_shapes | output_data_types | output_formats | stack_info |\n", + "+--------------+---------+---------------+--------------+------------------+---------------+---------------+-------------------+----------------+----------------------------------------------------+\n", + "| trans_Cast_5 | Cast | 493.64 | \"\" | INT32 | FORMAT_ND | \"\" | UINT64 | FORMAT_ND | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/functional.py(1279): dropout; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/dropout.py(58): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): |\n", + "| | | | | | | | | | _call_impl; /profiling_auto_GPT3/megatron/model/l |\n", + "| | | | | | | | | | anguage_model.py(236): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): |\n", + "| | | | | | | | | | _call_impl; /profiling_auto_GPT3/megatron/model/l |\n", + "| | | | | | | | | | anguage_model.py(425): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): |\n", + "| | | | | | | | | | _call_impl; /profiling_auto_GPT3/megatron/model/g |\n", + "| | | | | | | | | | pt_model.py(84): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): |\n", + "| | | | | | | | | | _call_impl; /profiling_auto_GPT3/megatron/model/m |\n", + "| | | | | | | | | | odule.py(184): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): |\n", + "| | | | | | | | | | _call_impl; /profiling_auto_GPT3/megatron/model/d |\n", + "| | | | | | | | | | istributed.py(58): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): |\n", + "| | | | | | | | | | _call_impl; ../../pretrain_gpt.py(88): |\n", + "| | | | | | | | | | forward_step; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/schedules.py(118): |\n", + "| | | | | | | | | | forward_step; /home/s30040711/Megatron-LM/megatro |\n", + "| | | | | | | | | | n_npu_adaptor/megatron_npu/adaptor_schedules.py(96 |\n", + "| | | | | | | | | | ): forward_backward_no_pipelining; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/training.py(419): |\n", + "| | | | | | | | | | train_step; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/training.py(837): |\n", + "| | | | | | | | | | train; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/training.py(152): |\n", + "| | | | | | | | | | pretrain; ../../pretrain_gpt.py(122): |\n", + "+--------------+---------+---------------+--------------+------------------+---------------+---------------+-------------------+----------------+----------------------------------------------------+\n", + "| trans_Cast_5 | Cast | 413.4 | \"\" | INT32 | FORMAT_ND | \"\" | UINT64 | FORMAT_ND | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/functional.py(1279): dropout; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/dropout.py(58): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): |\n", + "| | | | | | | | | | _call_impl; /profiling_auto_GPT3/megatron/model/l |\n", + "| | | | | | | | | | anguage_model.py(236): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): |\n", + "| | | | | | | | | | _call_impl; /profiling_auto_GPT3/megatron/model/l |\n", + "| | | | | | | | | | anguage_model.py(425): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): |\n", + "| | | | | | | | | | _call_impl; /profiling_auto_GPT3/megatron/model/g |\n", + "| | | | | | | | | | pt_model.py(84): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): |\n", + "| | | | | | | | | | _call_impl; /profiling_auto_GPT3/megatron/model/m |\n", + "| | | | | | | | | | odule.py(184): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): |\n", + "| | | | | | | | | | _call_impl; /profiling_auto_GPT3/megatron/model/d |\n", + "| | | | | | | | | | istributed.py(58): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): |\n", + "| | | | | | | | | | _call_impl; ../../pretrain_gpt.py(88): |\n", + "| | | | | | | | | | forward_step; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/schedules.py(118): |\n", + "| | | | | | | | | | forward_step; /home/s30040711/Megatron-LM/megatro |\n", + "| | | | | | | | | | n_npu_adaptor/megatron_npu/adaptor_schedules.py(10 |\n", + "| | | | | | | | | | 9): forward_backward_no_pipelining; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/training.py(419): |\n", + "| | | | | | | | | | train_step; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/training.py(837): |\n", + "| | | | | | | | | | train; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/training.py(152): |\n", + "| | | | | | | | | | pretrain; ../../pretrain_gpt.py(122): |\n", + "+--------------+---------+---------------+--------------+------------------+---------------+---------------+-------------------+----------------+----------------------------------------------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "if problems: # 如果存在相关问题则获取相关问题检测细节\n", + " aicpu = aicpu_result.get(\"AICPU operator\")\n", + " aicpu_table = PrettyTable(aicpu.get(\"headers\"))\n", + " for row in aicpu.get(\"data\"):\n", + " for i in range(len(row)):\n", + " row[i] = fill(str(row[i]), width=50)\n", + " aicpu_table.add_row(row)\n", + " aicpu_table.hrules = ALL\n", + " display(aicpu_table[:2])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/profiler/advisor/dataset/cluster/cluster_dataset.py b/profiler/advisor/dataset/cluster/cluster_dataset.py index 466349afd7378cb980a37e6a391a486fc0e9bc13..09fda2d4dcf2df2f05abb0007befb5c5c36ef824 100644 --- a/profiler/advisor/dataset/cluster/cluster_dataset.py +++ b/profiler/advisor/dataset/cluster/cluster_dataset.py @@ -65,7 +65,7 @@ class ClusterDataset(Dataset): class ClusterStepTraceTimeDataSet(ClusterDataset): RANK = "rank" - def __init__(self, collection_path: str, data: dict, kwargs: dict = None): + def __init__(self, collection_path: str, data: dict, **kwargs): self._step_dict = defaultdict() super().__init__(collection_path, data) @@ -107,7 +107,7 @@ class ClusterCommunicationDataSet(ClusterDataset): SDMA = "SDMA" RDMA = "RDMA" - def __init__(self, collection_path: str, data: dict, kwargs: dict = None): + def __init__(self, collection_path: str, data: dict, **kwargs): self.rank_bw_dict = defaultdict(lambda: { self.RDMA_TIME_MS: 0, self.RDMA_SIZE_MB: 0, diff --git a/profiler/advisor/dataset/timeline_event_dataset.py b/profiler/advisor/dataset/timeline_event_dataset.py index ca47227692c3ba2cb1b7956ed119ab62b81ba236..dc4453a7bd75ad29d4ed371c83dd1c50c9bfe874 100644 --- a/profiler/advisor/dataset/timeline_event_dataset.py +++ b/profiler/advisor/dataset/timeline_event_dataset.py @@ -78,7 +78,7 @@ class TimelineEventDataset(Dataset): return False if len(self.timeline_data_list) > 1: - logger.warning("Please ensure only one trace_view.json in %s, there will analysis first timeline profiling data.", self.timeline_dir) + logger.warning("Please ensure only one trace_view.json in %s, there will analyze first timeline profiling data.", self.timeline_dir) self.timeline_data_list = [self.timeline_data_list[0]] result = self.parse_data_with_generator(self._add_event) diff --git a/profiler/advisor/display/html/render.py b/profiler/advisor/display/html/render.py index 5b4b17ae4a742cbe087bdba9a98b22077fbf17e0..8ea7c9e0fc22c7da71a673e399fcfc231fbf1453 100644 --- a/profiler/advisor/display/html/render.py +++ b/profiler/advisor/display/html/render.py @@ -5,6 +5,7 @@ from typing import List, Dict from jinja2 import Environment, FileSystemLoader from profiler.advisor.common import constant +from profiler.advisor.config.config import Config from profiler.advisor.utils.utils import singleton, safe_write logger = logging.getLogger() @@ -41,4 +42,4 @@ class HTMLRender: return safe_write(self.html, save_path) - logger.info("Save suggestion to %s.", save_path) + logger.info("Save suggestion to %s.", os.path.join(Config().work_path, save_path)) diff --git a/profiler/advisor/interface/interface.py b/profiler/advisor/interface/interface.py index f9b9fb467ec5f74589c3f381bfd2876587e36b8e..1e0f0e9a563b8512f5db3a7b812a4f05a9c343f0 100644 --- a/profiler/advisor/interface/interface.py +++ b/profiler/advisor/interface/interface.py @@ -5,7 +5,7 @@ sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.dirname( sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), "compare_tools")) from profiler.advisor.utils.utils import Timer -from profiler.advisor.analyzer.computation.profiling_analyzer import ProfilingAnalyzer +from profiler.advisor.analyzer.computation.profiling_analyzer import AicpuAnalyzer, BlockDimAnalyzer, DynamicShapeAnalyzer, OperatorBoundAnalyzer from profiler.advisor.analyzer.schedule.fusion_ops.fusion_ops_analyzer import TimelineFusionOpsAnalyzer from profiler.advisor.analyzer.graph_fusion.graph_fusion_analyzer import FusionOPAnalyzer from profiler.advisor.common.analyzer_scopes import SupportedScopes @@ -19,7 +19,10 @@ class Interface: SupportedScopes.TIMELINE_FUSION_OPS: TimelineFusionOpsAnalyzer }), "computation": OrderedDict({ - SupportedScopes.PROFILING_OPERATOR_ANALYSIS: ProfilingAnalyzer, + SupportedScopes.DYNAMIC_SHAPE_ANALYSIS: DynamicShapeAnalyzer, + SupportedScopes.AICPU_ANALYSIS: AicpuAnalyzer, + SupportedScopes.OPERATOR_NO_BOUND_ANALYSIS: OperatorBoundAnalyzer, + SupportedScopes.BLOCK_DIM_ANALYSIS: BlockDimAnalyzer, SupportedScopes.GRAPH: FusionOPAnalyzer }), "communication": OrderedDict(), diff --git a/profiler/advisor/result/result.py b/profiler/advisor/result/result.py index 237317051984f0afe13ceb5b6d06bf348934451b..ebee4f2cc3e687d506422ee05d7d4c4614c91b9d 100644 --- a/profiler/advisor/result/result.py +++ b/profiler/advisor/result/result.py @@ -139,6 +139,7 @@ class OptimizeResult: for sheet_name, sheet_data in self.sheet_recorder.sheet_data.items(): self.result_writer.add_data(sheet_name, sheet_data.get("headers"), sheet_data.get("data")) self.result_writer.save() + logger.info("Save problems details file to %s", Config().analysis_result_file) self._save_op_file_list() TerminalResult().print() diff --git a/profiler/cli/analyze_cli.py b/profiler/cli/analyze_cli.py index b40da6af3036b19577cdac45ecfd7fe9759b591c..7bd7f1722517edc2e8177d3b88af06a6217cf5f2 100644 --- a/profiler/cli/analyze_cli.py +++ b/profiler/cli/analyze_cli.py @@ -42,7 +42,7 @@ def _analyze(dimensions, **kwargs): for i, (dimension, scope, interface) in enumerate(job_list[::-1]): result_list.append( - interface.get_result(dimension, scope, render_html=i == len(job_list) - 1, output_dict=False)) + interface.get_result(dimension, scope, render_html=i == len(job_list) - 1, output_dict=False, **kwargs)) for result in result_list[::-1]: if result and hasattr(result, "show"): diff --git a/profiler/cli/entrance.py b/profiler/cli/entrance.py index a260553031ecfc904ae8411d944037bdb2f101ab..f668abe651d78bfa3338ac4529189d5354f5c529 100644 --- a/profiler/cli/entrance.py +++ b/profiler/cli/entrance.py @@ -65,3 +65,11 @@ msprof_analyze_cli.add_command(analyze_cli, name="advisor") msprof_analyze_cli.add_command(compare_cli, name="compare") msprof_analyze_cli.add_command(cluster_cli, name="cluster") msprof_analyze_cli.add_command(auto_complete_cli, name="auto-completion") + +if __name__ == '__main__': + msprof_analyze_cli.main( + [ + "advisor", "all", "-d", + r"C:\personalC\profiling_data" + ] + ) \ No newline at end of file