From 7786f73aa8c95dbe3054bad5d982365be8e8e0b6 Mon Sep 17 00:00:00 2001
From: stby <295887736@qq.com>
Date: Tue, 18 Jun 2024 10:02:12 +0800
Subject: [PATCH 1/7] review advisor 330~630
---
profiler/advisor_review/README.md | 80 ++
profiler/advisor_review/__init__.py | 17 +
.../advisor_backend/__init__.py | 14 +
.../advisor_backend/advice_base.py | 50 +
.../advice_factory/__init__.py | 14 +
.../advice_factory/advice_factory.py | 50 +
.../advice_factory/cluster_advice_factory.py | 38 +
.../advice_factory/compute_advice_factory.py | 34 +
.../advice_factory/overall_advice_factory.py | 32 +
.../advice_factory/timeline_advice_factory.py | 34 +
.../cluster_advice/__init__.py | 14 +
.../cluster_advice/cluster_advice_base.py | 67 ++
.../cluster_advice/cluster_pipeline_advice.py | 437 +++++++
.../cluster_advice/kernel_cluster_advice.py | 62 +
.../cluster_advice/slow_link_advice.py | 110 ++
.../cluster_advice/slow_rank_advice.py | 71 ++
.../common_func_advisor/__init__.py | 14 +
.../common_func_advisor/constant.py | 225 ++++
.../common_func_advisor/trace_view_json.py | 209 ++++
.../trace_view_preprocessor.py | 208 ++++
.../compute_advice/__init__.py | 14 +
.../compute_advice/compute_advice_base.py | 105 ++
.../compute_advice/npu_fused/__init__.py | 14 +
.../compute_advice/npu_fused/csv_analyzer.py | 81 ++
.../compute_advice/npu_fused/json_analyzer.py | 55 +
.../compute_advice/npu_fused/op_perf.py | 196 ++++
.../compute_advice/npu_fused_advice.py | 71 ++
.../compute_advice/npu_slow_advice.py | 82 ++
.../advisor_backend/interface.py | 62 +
.../overall_advice/__init__.py | 0
.../overall_advice/overall_summary_advice.py | 176 +++
.../prof_bean_advisor/__init__.py | 14 +
.../cluster_step_trace_time_bean.py | 67 ++
.../timeline_advice/__init__.py | 14 +
.../timeline_advice/op_schedule_advice.py | 89 ++
.../timeline_advice/optimizer_advice.py | 55 +
.../timeline_advice/timeline_advice_base.py | 99 ++
profiler/advisor_review/analyzer/__init__.py | 0
.../advisor_review/analyzer/base_analyzer.py | 94 ++
.../analyzer/cluster/__init__.py | 0
.../analyzer/cluster/slow_link_analyser.py | 122 ++
.../analyzer/cluster/slow_rank_analyser.py | 112 ++
.../analyzer/communication/__init__.py | 0
.../communication/bandwidth/__init__.py | 0
.../communication/environment/__init__.py | 0
.../analyzer/computation/__init__.py | 0
.../analyzer/computation/aicpu/__init__.py | 0
.../computation/aicpu/aicpu_checker.py | 278 +++++
.../analyzer/computation/bound/__init__.py | 0
.../computation/bound/block_dim_checker.py | 75 ++
.../bound/operator_bound_checker.py | 53 +
.../computation/op_compile/__init__.py | 0
.../op_compile/dynamic_shape_checker.py | 65 +
.../analyzer/computation/operator_checker.py | 307 +++++
.../computation/profiling_analyzer.py | 89 ++
.../analyzer/dataloader/__init__.py | 0
.../analyzer/graph_fusion/__init__.py | 0
.../graph_fusion/graph_fusion_analyzer.py | 49 +
.../graph_fusion/graph_fusion_checker.py | 207 ++++
.../analyzer/overall/__init__.py | 0
.../analyzer/overall/overall_analyzer.py | 45 +
.../overall/overall_summary_analyzer.py | 262 +++++
.../analyzer/schedule/__init__.py | 0
.../analyzer/schedule/dispatch/__init__.py | 0
.../dispatch/timeline_op_dispatch_analyzer.py | 107 ++
.../analyzer/schedule/free_event/__init__.py | 0
.../analyzer/schedule/fusion_ops/__init__.py | 0
.../fusion_ops/fusion_ops_analyzer.py | 271 +++++
.../fusion_ops/timeline_api_stack_checker.py | 163 +++
.../cluster_perf_analysis.ipynb | 1042 +++++++++++++++++
profiler/advisor_review/common/__init__.py | 0
.../advisor_review/common/analyzer_scopes.py | 14 +
profiler/advisor_review/common/constant.py | 140 +++
.../advisor_review/common/graph/__init__.py | 0
profiler/advisor_review/common/graph/graph.py | 135 +++
.../common/graph/graph_match.py | 355 ++++++
.../common/graph/graph_parser.py | 413 +++++++
.../common/profiling/__init__.py | 0
.../common/profiling/ge_info.py | 47 +
.../advisor_review/common/profiling/msprof.py | 144 +++
.../common/profiling/op_summary.py | 76 ++
.../common/profiling/tasktime.py | 75 ++
.../common/timeline/__init__.py | 0
.../advisor_review/common/timeline/event.py | 23 +
.../common/timeline/fusion_ops_db.py | 269 +++++
.../common/timeline/fusion_ops_rule.py | 110 ++
.../timeline/fusion_ops_rule_handler.py | 193 +++
.../advisor_review/common/version_control.py | 26 +
.../advisor_review/computation_analysis.ipynb | 748 ++++++++++++
profiler/advisor_review/config/__init__.py | 0
profiler/advisor_review/config/config.ini | 16 +
profiler/advisor_review/config/config.py | 108 ++
.../config/profiling_data_version_config.yaml | 80 ++
profiler/advisor_review/dataset/__init__.py | 0
.../dataset/cluster/__init__.py | 0
.../dataset/cluster/cluster_dataset.py | 165 +++
.../cluster/cluster_step_trace_time_bean.py | 67 ++
profiler/advisor_review/dataset/dataset.py | 38 +
.../advisor_review/dataset/graph_dataset.py | 53 +
.../dataset/profiling/__init__.py | 0
.../dataset/profiling/builder_base.py | 39 +
.../dataset/profiling/db_manager.py | 70 ++
.../dataset/profiling/device_info.py | 61 +
.../dataset/profiling/info_collection.py | 270 +++++
.../dataset/profiling/profiling_dataset.py | 79 ++
.../dataset/profiling/profiling_parser.py | 132 +++
.../dataset/timeline_event_dataset.py | 220 ++++
profiler/advisor_review/display/__init__.py | 0
.../advisor_review/display/html/__init__.py | 0
.../advisor_review/display/html/render.py | 45 +
.../display/html/templates/__init__.py | 0
.../display/html/templates/affinity_api.html | 50 +
.../html/templates/cluster_analysis.html | 49 +
.../html/templates/compute_analysis.html | 29 +
.../display/html/templates/fusion.html | 47 +
.../display/html/templates/main.html | 203 ++++
.../html/templates/operator_ai_cpu.html | 61 +
.../html/templates/operator_block_dim.html | 38 +
.../html/templates/operator_dispatch.html | 37 +
.../templates/operator_dynamic_shape.html | 15 +
.../html/templates/operator_no_bound.html | 38 +
.../html/templates/overall_analysis.html | 15 +
.../html/templates/timeline_analysis.html | 34 +
.../fusion_operators_api_analysis.ipynb | 211 ++++
.../advisor_review/img/advisor_result.PNG | Bin 0 -> 53557 bytes
.../advisor_review/img/jupyter_report.PNG | Bin 0 -> 34097 bytes
profiler/advisor_review/interface/__init__.py | 0
.../advisor_review/interface/interface.py | 75 ++
profiler/advisor_review/result/__init__.py | 0
profiler/advisor_review/result/item.py | 61 +
profiler/advisor_review/result/result.py | 210 ++++
profiler/advisor_review/rules/__init__.py | 0
.../advisor_review/rules/aicpu_rules.yaml | 103 ++
.../advisor_review/rules/op_fusion_pass.yaml | 491 ++++++++
.../rules/timeline_fusion_ops.yaml | 59 +
profiler/advisor_review/utils/__init__.py | 0
profiler/advisor_review/utils/log.py | 63 +
profiler/advisor_review/utils/tools.py | 76 ++
profiler/advisor_review/utils/utils.py | 552 +++++++++
profiler/advisor_review/version.py | 38 +
140 files changed, 13081 insertions(+)
create mode 100644 profiler/advisor_review/README.md
create mode 100644 profiler/advisor_review/__init__.py
create mode 100644 profiler/advisor_review/advisor_backend/__init__.py
create mode 100644 profiler/advisor_review/advisor_backend/advice_base.py
create mode 100644 profiler/advisor_review/advisor_backend/advice_factory/__init__.py
create mode 100644 profiler/advisor_review/advisor_backend/advice_factory/advice_factory.py
create mode 100644 profiler/advisor_review/advisor_backend/advice_factory/cluster_advice_factory.py
create mode 100644 profiler/advisor_review/advisor_backend/advice_factory/compute_advice_factory.py
create mode 100644 profiler/advisor_review/advisor_backend/advice_factory/overall_advice_factory.py
create mode 100644 profiler/advisor_review/advisor_backend/advice_factory/timeline_advice_factory.py
create mode 100644 profiler/advisor_review/advisor_backend/cluster_advice/__init__.py
create mode 100644 profiler/advisor_review/advisor_backend/cluster_advice/cluster_advice_base.py
create mode 100644 profiler/advisor_review/advisor_backend/cluster_advice/cluster_pipeline_advice.py
create mode 100644 profiler/advisor_review/advisor_backend/cluster_advice/kernel_cluster_advice.py
create mode 100644 profiler/advisor_review/advisor_backend/cluster_advice/slow_link_advice.py
create mode 100644 profiler/advisor_review/advisor_backend/cluster_advice/slow_rank_advice.py
create mode 100644 profiler/advisor_review/advisor_backend/common_func_advisor/__init__.py
create mode 100644 profiler/advisor_review/advisor_backend/common_func_advisor/constant.py
create mode 100644 profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_json.py
create mode 100644 profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_preprocessor.py
create mode 100644 profiler/advisor_review/advisor_backend/compute_advice/__init__.py
create mode 100644 profiler/advisor_review/advisor_backend/compute_advice/compute_advice_base.py
create mode 100644 profiler/advisor_review/advisor_backend/compute_advice/npu_fused/__init__.py
create mode 100644 profiler/advisor_review/advisor_backend/compute_advice/npu_fused/csv_analyzer.py
create mode 100644 profiler/advisor_review/advisor_backend/compute_advice/npu_fused/json_analyzer.py
create mode 100644 profiler/advisor_review/advisor_backend/compute_advice/npu_fused/op_perf.py
create mode 100644 profiler/advisor_review/advisor_backend/compute_advice/npu_fused_advice.py
create mode 100644 profiler/advisor_review/advisor_backend/compute_advice/npu_slow_advice.py
create mode 100644 profiler/advisor_review/advisor_backend/interface.py
create mode 100644 profiler/advisor_review/advisor_backend/overall_advice/__init__.py
create mode 100644 profiler/advisor_review/advisor_backend/overall_advice/overall_summary_advice.py
create mode 100644 profiler/advisor_review/advisor_backend/prof_bean_advisor/__init__.py
create mode 100644 profiler/advisor_review/advisor_backend/prof_bean_advisor/cluster_step_trace_time_bean.py
create mode 100644 profiler/advisor_review/advisor_backend/timeline_advice/__init__.py
create mode 100644 profiler/advisor_review/advisor_backend/timeline_advice/op_schedule_advice.py
create mode 100644 profiler/advisor_review/advisor_backend/timeline_advice/optimizer_advice.py
create mode 100644 profiler/advisor_review/advisor_backend/timeline_advice/timeline_advice_base.py
create mode 100644 profiler/advisor_review/analyzer/__init__.py
create mode 100644 profiler/advisor_review/analyzer/base_analyzer.py
create mode 100644 profiler/advisor_review/analyzer/cluster/__init__.py
create mode 100644 profiler/advisor_review/analyzer/cluster/slow_link_analyser.py
create mode 100644 profiler/advisor_review/analyzer/cluster/slow_rank_analyser.py
create mode 100644 profiler/advisor_review/analyzer/communication/__init__.py
create mode 100644 profiler/advisor_review/analyzer/communication/bandwidth/__init__.py
create mode 100644 profiler/advisor_review/analyzer/communication/environment/__init__.py
create mode 100644 profiler/advisor_review/analyzer/computation/__init__.py
create mode 100644 profiler/advisor_review/analyzer/computation/aicpu/__init__.py
create mode 100644 profiler/advisor_review/analyzer/computation/aicpu/aicpu_checker.py
create mode 100644 profiler/advisor_review/analyzer/computation/bound/__init__.py
create mode 100644 profiler/advisor_review/analyzer/computation/bound/block_dim_checker.py
create mode 100644 profiler/advisor_review/analyzer/computation/bound/operator_bound_checker.py
create mode 100644 profiler/advisor_review/analyzer/computation/op_compile/__init__.py
create mode 100644 profiler/advisor_review/analyzer/computation/op_compile/dynamic_shape_checker.py
create mode 100644 profiler/advisor_review/analyzer/computation/operator_checker.py
create mode 100644 profiler/advisor_review/analyzer/computation/profiling_analyzer.py
create mode 100644 profiler/advisor_review/analyzer/dataloader/__init__.py
create mode 100644 profiler/advisor_review/analyzer/graph_fusion/__init__.py
create mode 100644 profiler/advisor_review/analyzer/graph_fusion/graph_fusion_analyzer.py
create mode 100644 profiler/advisor_review/analyzer/graph_fusion/graph_fusion_checker.py
create mode 100644 profiler/advisor_review/analyzer/overall/__init__.py
create mode 100644 profiler/advisor_review/analyzer/overall/overall_analyzer.py
create mode 100644 profiler/advisor_review/analyzer/overall/overall_summary_analyzer.py
create mode 100644 profiler/advisor_review/analyzer/schedule/__init__.py
create mode 100644 profiler/advisor_review/analyzer/schedule/dispatch/__init__.py
create mode 100644 profiler/advisor_review/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py
create mode 100644 profiler/advisor_review/analyzer/schedule/free_event/__init__.py
create mode 100644 profiler/advisor_review/analyzer/schedule/fusion_ops/__init__.py
create mode 100644 profiler/advisor_review/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py
create mode 100644 profiler/advisor_review/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py
create mode 100644 profiler/advisor_review/cluster_perf_analysis.ipynb
create mode 100644 profiler/advisor_review/common/__init__.py
create mode 100644 profiler/advisor_review/common/analyzer_scopes.py
create mode 100644 profiler/advisor_review/common/constant.py
create mode 100644 profiler/advisor_review/common/graph/__init__.py
create mode 100644 profiler/advisor_review/common/graph/graph.py
create mode 100644 profiler/advisor_review/common/graph/graph_match.py
create mode 100644 profiler/advisor_review/common/graph/graph_parser.py
create mode 100644 profiler/advisor_review/common/profiling/__init__.py
create mode 100644 profiler/advisor_review/common/profiling/ge_info.py
create mode 100644 profiler/advisor_review/common/profiling/msprof.py
create mode 100644 profiler/advisor_review/common/profiling/op_summary.py
create mode 100644 profiler/advisor_review/common/profiling/tasktime.py
create mode 100644 profiler/advisor_review/common/timeline/__init__.py
create mode 100644 profiler/advisor_review/common/timeline/event.py
create mode 100644 profiler/advisor_review/common/timeline/fusion_ops_db.py
create mode 100644 profiler/advisor_review/common/timeline/fusion_ops_rule.py
create mode 100644 profiler/advisor_review/common/timeline/fusion_ops_rule_handler.py
create mode 100644 profiler/advisor_review/common/version_control.py
create mode 100644 profiler/advisor_review/computation_analysis.ipynb
create mode 100644 profiler/advisor_review/config/__init__.py
create mode 100644 profiler/advisor_review/config/config.ini
create mode 100644 profiler/advisor_review/config/config.py
create mode 100644 profiler/advisor_review/config/profiling_data_version_config.yaml
create mode 100644 profiler/advisor_review/dataset/__init__.py
create mode 100644 profiler/advisor_review/dataset/cluster/__init__.py
create mode 100644 profiler/advisor_review/dataset/cluster/cluster_dataset.py
create mode 100644 profiler/advisor_review/dataset/cluster/cluster_step_trace_time_bean.py
create mode 100644 profiler/advisor_review/dataset/dataset.py
create mode 100644 profiler/advisor_review/dataset/graph_dataset.py
create mode 100644 profiler/advisor_review/dataset/profiling/__init__.py
create mode 100644 profiler/advisor_review/dataset/profiling/builder_base.py
create mode 100644 profiler/advisor_review/dataset/profiling/db_manager.py
create mode 100644 profiler/advisor_review/dataset/profiling/device_info.py
create mode 100644 profiler/advisor_review/dataset/profiling/info_collection.py
create mode 100644 profiler/advisor_review/dataset/profiling/profiling_dataset.py
create mode 100644 profiler/advisor_review/dataset/profiling/profiling_parser.py
create mode 100644 profiler/advisor_review/dataset/timeline_event_dataset.py
create mode 100644 profiler/advisor_review/display/__init__.py
create mode 100644 profiler/advisor_review/display/html/__init__.py
create mode 100644 profiler/advisor_review/display/html/render.py
create mode 100644 profiler/advisor_review/display/html/templates/__init__.py
create mode 100644 profiler/advisor_review/display/html/templates/affinity_api.html
create mode 100644 profiler/advisor_review/display/html/templates/cluster_analysis.html
create mode 100644 profiler/advisor_review/display/html/templates/compute_analysis.html
create mode 100644 profiler/advisor_review/display/html/templates/fusion.html
create mode 100644 profiler/advisor_review/display/html/templates/main.html
create mode 100644 profiler/advisor_review/display/html/templates/operator_ai_cpu.html
create mode 100644 profiler/advisor_review/display/html/templates/operator_block_dim.html
create mode 100644 profiler/advisor_review/display/html/templates/operator_dispatch.html
create mode 100644 profiler/advisor_review/display/html/templates/operator_dynamic_shape.html
create mode 100644 profiler/advisor_review/display/html/templates/operator_no_bound.html
create mode 100644 profiler/advisor_review/display/html/templates/overall_analysis.html
create mode 100644 profiler/advisor_review/display/html/templates/timeline_analysis.html
create mode 100644 profiler/advisor_review/fusion_operators_api_analysis.ipynb
create mode 100644 profiler/advisor_review/img/advisor_result.PNG
create mode 100644 profiler/advisor_review/img/jupyter_report.PNG
create mode 100644 profiler/advisor_review/interface/__init__.py
create mode 100644 profiler/advisor_review/interface/interface.py
create mode 100644 profiler/advisor_review/result/__init__.py
create mode 100644 profiler/advisor_review/result/item.py
create mode 100644 profiler/advisor_review/result/result.py
create mode 100644 profiler/advisor_review/rules/__init__.py
create mode 100644 profiler/advisor_review/rules/aicpu_rules.yaml
create mode 100644 profiler/advisor_review/rules/op_fusion_pass.yaml
create mode 100644 profiler/advisor_review/rules/timeline_fusion_ops.yaml
create mode 100644 profiler/advisor_review/utils/__init__.py
create mode 100644 profiler/advisor_review/utils/log.py
create mode 100644 profiler/advisor_review/utils/tools.py
create mode 100644 profiler/advisor_review/utils/utils.py
create mode 100644 profiler/advisor_review/version.py
diff --git a/profiler/advisor_review/README.md b/profiler/advisor_review/README.md
new file mode 100644
index 00000000000..283aa294388
--- /dev/null
+++ b/profiler/advisor_review/README.md
@@ -0,0 +1,80 @@
+# advisor
+
+msprof-analyze的advisor功能是将Ascend PyTorch Profiler或者msprof采集的Python场景性能数据进行分析,并输出性能调优建议(当前暂不支持对db格式文件分析)。
+
+## 工具使用(命令行方式)
+
+1. 参见《[性能工具](../README.md)》完成工具安装。建议安装最新版本。
+
+2. 执行分析。
+
+ - 总体性能瓶颈
+
+ ```bash
+ msprof-analyze advisor all -d [待分析性能数据文件所在路径] -bp [基准性能数据文件所在路径]
+ ```
+
+ - 计算瓶颈
+
+ ```bash
+ msprof-analyze advisor computation -d [待分析性能数据文件所在路径]
+ ```
+
+ - 调度瓶颈
+
+ ```bash
+ msprof-analyze advisor schedule -d [待分析性能数据文件所在路径]
+ ```
+
+
+ -d(必选):待分析性能数据文件所在路径。
+
+ -bp(可选):基准性能数据文件所在路径。
+
+ 单卡场景需要指定到性能数据文件`*_ascend_pt`目录;多卡或集群场景需要指定到`*_ascend_pt`目录的父目录层级。
+
+3. 查看结果。
+
+ 分析结果打屏展示并生成html和csv文件。
+
+## 工具使用(Jupyter Notebook方式)
+
+Jupyter Notebook使用方式如下:
+
+下列以Windows环境下执行为例介绍。
+
+1. 在环境下安装Jupyter Notebook工具。
+
+ ```bash
+ pip install jupyter notebook
+ ```
+
+ Jupyter Notebook工具的具体安装和使用指导请至Jupyter Notebook工具官网查找。
+
+2. 在环境下安装ATT工具。
+
+ ```
+ git clone https://gitee.com/ascend/att.git
+ ```
+
+ 安装环境下保存Ascend PyTorch Profiler采集的性能数据。
+
+3. 进入att\profiler\advisor目录执行如下命令启动Jupyter Notebook工具。
+
+ ```bash
+ jupyter notebook
+ ```
+
+ 执行成功则自动启动浏览器读取att\profiler\advisor目录,如下示例:
+
+ 
+
+ 若在Linux环境下则回显打印URL地址,即是打开Jupyter Notebook工具页面的地址,需要复制URL,并使用浏览器访问(若为远端服务器则需要将域名“**localhost**”替换为远端服务器的IP),进入Jupyter Notebook工具页面。
+
+4. 每个.ipynb文件为一项性能数据分析任务,选择需要的.ipynb打开,并在*_path参数下拷贝保存Ascend PyTorch Profiler采集的性能数据的路径。如下示例:
+
+ 
+
+5. 单击运行按钮执行性能数据分析。
+
+ 分析结果详细内容会在.ipynb页面下展示。
diff --git a/profiler/advisor_review/__init__.py b/profiler/advisor_review/__init__.py
new file mode 100644
index 00000000000..e79018ed05c
--- /dev/null
+++ b/profiler/advisor_review/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from profiler.advisor.interface.interface import Interface
\ No newline at end of file
diff --git a/profiler/advisor_review/advisor_backend/__init__.py b/profiler/advisor_review/advisor_backend/__init__.py
new file mode 100644
index 00000000000..a0e9f748f4b
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
diff --git a/profiler/advisor_review/advisor_backend/advice_base.py b/profiler/advisor_review/advisor_backend/advice_base.py
new file mode 100644
index 00000000000..35939bcea9c
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_base.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from abc import abstractmethod
+
+
+class AdviceBase:
+ DATA = "data"
+ BOTTLENECK = "bottleneck"
+ ADVICE = "advice"
+
+ def __init__(self, collection_path: str):
+ self.collection_path = os.path.realpath(collection_path)
+ self.bottelneck = ''
+ self.output_format_data = {
+ self.DATA: [],
+ self.BOTTLENECK: '',
+ self.ADVICE: ''
+ }
+
+ @abstractmethod
+ def path_check(self):
+ """
+ check whether input path is valid
+ """
+
+ @abstractmethod
+ def run(self):
+ """
+ analyze profiling data and advice
+ """
+
+ @abstractmethod
+ def output(self):
+ """
+ output relevant data
+ """
\ No newline at end of file
diff --git a/profiler/advisor_review/advisor_backend/advice_factory/__init__.py b/profiler/advisor_review/advisor_backend/advice_factory/__init__.py
new file mode 100644
index 00000000000..a0e9f748f4b
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_factory/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
diff --git a/profiler/advisor_review/advisor_backend/advice_factory/advice_factory.py b/profiler/advisor_review/advisor_backend/advice_factory/advice_factory.py
new file mode 100644
index 00000000000..639f4800cfe
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_factory/advice_factory.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+from common_func.path_manager import PathManager
+
+
+class AdviceFactory:
+ def __init__(self, collection_path: str):
+ self.collection_path = os.path.realpath(collection_path)
+
+ @staticmethod
+ def run_advice(self, advice: str, kwargs: dict):
+ """
+ run advice to produce data
+ """
+
+ def produce_advice(self, advice: str, kwargs: dict):
+ """
+ produce data for input mode and advice
+ """
+ self.path_check()
+ self.advice_check(advice)
+ return self.run_advice(advice, kwargs)
+
+ def path_check(self):
+ """
+ check whether input path is valid
+ """
+ PathManager.input_path_common_check(self.collection_path)
+
+ def advice_check(self, advice: str):
+ """
+ check whether input advice is valid
+ """
+ if advice not in self.ADVICE_LIB.keys():
+ msg = '[ERROR]Input advice is illegal.'
+ raise RuntimeError(msg)
diff --git a/profiler/advisor_review/advisor_backend/advice_factory/cluster_advice_factory.py b/profiler/advisor_review/advisor_backend/advice_factory/cluster_advice_factory.py
new file mode 100644
index 00000000000..6bb93f46704
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_factory/cluster_advice_factory.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from advice_factory.advice_factory import AdviceFactory
+from cluster_advice.slow_link_advice import SlowLinkAdvice
+from cluster_advice.slow_rank_advice import SlowRankAdvice
+from cluster_advice.cluster_pipeline_advice import ClusterPipelineAdvice
+from cluster_advice.kernel_cluster_advice import KernelClusterAdvice
+from common_func_advisor.constant import Constant
+
+
+class ClusterAdviceFactory(AdviceFactory):
+ ADVICE_LIB = {
+ Constant.SLOW_RANK: SlowRankAdvice,
+ Constant.SLOW_LINK: SlowLinkAdvice,
+ Constant.PIPELINE: ClusterPipelineAdvice,
+ Constant.KERNEL: KernelClusterAdvice
+ }
+
+ def __init__(self, collection_path: str):
+ super().__init__(collection_path)
+
+ def run_advice(self, advice: str, kwargs: dict):
+ """
+ run advice to produce data
+ """
+ return self.ADVICE_LIB.get(advice)(self.collection_path, kwargs).run()
diff --git a/profiler/advisor_review/advisor_backend/advice_factory/compute_advice_factory.py b/profiler/advisor_review/advisor_backend/advice_factory/compute_advice_factory.py
new file mode 100644
index 00000000000..336bef7dd85
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_factory/compute_advice_factory.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from common_func_advisor.constant import Constant
+from advice_factory.advice_factory import AdviceFactory
+from compute_advice.npu_fused_advice import NpuFusedAdvice
+from compute_advice.npu_slow_advice import NpuSlowAdvice
+
+
+class ComputeAdviceFactory(AdviceFactory):
+ ADVICE_LIB = {
+ Constant.NPU_FUSED: NpuFusedAdvice,
+ Constant.NPU_SLOW: NpuSlowAdvice,
+ }
+
+ def __init__(self, collection_path: str):
+ super().__init__(collection_path)
+
+ def run_advice(self, advice: str, kwargs: dict):
+ """
+ run advice to produce data
+ """
+ return self.ADVICE_LIB.get(advice)(self.collection_path).run()
diff --git a/profiler/advisor_review/advisor_backend/advice_factory/overall_advice_factory.py b/profiler/advisor_review/advisor_backend/advice_factory/overall_advice_factory.py
new file mode 100644
index 00000000000..baf80cc200f
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_factory/overall_advice_factory.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from advice_factory.advice_factory import AdviceFactory
+from common_func_advisor.constant import Constant
+from overall_advice.overall_summary_advice import OverallSummaryAdvice
+
+
+class OverallAdviceFactory(AdviceFactory):
+ ADVICE_LIB = {
+ Constant.SUMMARY: OverallSummaryAdvice
+ }
+
+ def __init__(self, collection_path: str):
+ super().__init__(collection_path)
+
+ def run_advice(self, advice: str, kwargs: dict):
+ """
+ run advice to produce data
+ """
+ return self.ADVICE_LIB.get(advice)(self.collection_path, kwargs).run()
diff --git a/profiler/advisor_review/advisor_backend/advice_factory/timeline_advice_factory.py b/profiler/advisor_review/advisor_backend/advice_factory/timeline_advice_factory.py
new file mode 100644
index 00000000000..44b352e95a7
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_factory/timeline_advice_factory.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from advice_factory.advice_factory import AdviceFactory
+from common_func_advisor.constant import Constant
+from timeline_advice.optimizer_advice import OptimizerAdvice
+from timeline_advice.op_schedule_advice import OpScheduleAdvice
+
+
class TimelineAdviceFactory(AdviceFactory):
    """Factory dispatching timeline-based advice (optimizer / op schedule)."""

    ADVICE_LIB = {
        Constant.OPTIM: OptimizerAdvice,
        Constant.OP_SCHE: OpScheduleAdvice,
    }

    def __init__(self, collection_path: str):
        super().__init__(collection_path)

    def run_advice(self, advice: str, kwargs: dict):
        """
        run advice to produce data
        """
        advice_cls = self.ADVICE_LIB.get(advice)
        return advice_cls(self.collection_path).run()
diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/__init__.py b/profiler/advisor_review/advisor_backend/cluster_advice/__init__.py
new file mode 100644
index 00000000000..8400fd5ecd1
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/cluster_advice/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/cluster_advice_base.py b/profiler/advisor_review/advisor_backend/cluster_advice/cluster_advice_base.py
new file mode 100644
index 00000000000..e9be4675963
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/cluster_advice/cluster_advice_base.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from abc import abstractmethod
+from common_func.constant import Constant
+from advice_base import AdviceBase
+from cluster_analysis import Interface
+
+
class ClusterAdviceBase(AdviceBase):
    """Common behaviour for cluster-level advice: path checking and on-demand
    execution of the cluster analysis backend."""

    def __init__(self, collection_path: str):
        super().__init__(collection_path)

    @staticmethod
    def compute_max_gap_ratio(data: list, mean: float):
        """Return (max - min) / mean for ``data``; 0 when ``mean`` is 0."""
        if mean == 0:
            return 0
        return (max(data) - min(data)) / mean

    def path_check(self):
        """
        check whether input path is valid
        """
        if 'cluster_analysis_output' in os.listdir(self.collection_path):
            # analysis output already present: nothing to do
            print("[INFO]Cluster has been analyzed "
                  "because of the existence of cluster analysis output directory.")
            print("[INFO]Skip Cluster analyze backend.")
            return
        print("[INFO] cluster analysis is in the process, please wait...")
        self.cluster_analyze()

    def cluster_analyze(self):
        """Run the cluster analysis backend over the whole collection path."""
        parameter = {
            Constant.COLLECTION_PATH: self.collection_path,
            Constant.ANALYSIS_MODE: "all"
        }
        try:
            Interface(parameter).run()
        except Exception as e:
            raise ValueError(f"Cluster analyze backend failed:{e}") from e

    @abstractmethod
    def run(self):
        """
        analyze profiling data and advice
        """

    @abstractmethod
    def output(self):
        """
        output relevant data
        """
\ No newline at end of file
diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/cluster_pipeline_advice.py b/profiler/advisor_review/advisor_backend/cluster_advice/cluster_pipeline_advice.py
new file mode 100644
index 00000000000..7f8846f1d99
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/cluster_advice/cluster_pipeline_advice.py
@@ -0,0 +1,437 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import multiprocessing
+from typing import Dict
+from typing import Optional
+from typing import Deque
+from typing import List
+from typing import Tuple
+from collections import defaultdict
+from collections import deque
+from decimal import Decimal
+from dataclasses import dataclass
+
+from common_func.file_manager import FileManager
+from common_func_advisor.constant import Constant
+from common_func_advisor.trace_view_preprocessor import FineTraceViewData
+from common_func_advisor.trace_view_preprocessor import TraceViewPreProcessor
+from cluster_advice.cluster_advice_base import ClusterAdviceBase
+from cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor
+
+
@dataclass
class PipelineTimeSlice:
    """One [start, end] slice of the pipeline timeline; timestamps are decimal
    strings in microseconds. ``bp_timeslice`` holds (start, end) BP sub-ranges."""
    start: str = ""
    end: str = ""
    slice_type: str = ""
    # initialised in __post_init__ to avoid a shared mutable default
    bp_timeslice: list = None

    def __post_init__(self):
        if not self.bp_timeslice:
            self.bp_timeslice = []
+
+
class PipelineTraceViewer:
    """Converts PipelineTimeSlice lists into chrome-trace "X" (complete) events."""
    STAGE_COLOR = "good"
    BUBBLE_COLOR = "generic_work"
    FP_COLOR = "good"
    BP_COLOR = "bad"
    PIPLINE_VIEW = "Pipeline View"
    STAGE = "Stage"
    BUBBLE = "Bubble"
    FP = "FP"
    BP = "BP"

    # event name -> trace color name (cname)
    COLORS = {
        STAGE: STAGE_COLOR,
        BUBBLE: BUBBLE_COLOR,
        FP: FP_COLOR,
        BP: BP_COLOR
    }

    def _gen_trace_pair(self, name: str, start_ts: str, end_ts: str, pid: str, tid: str) -> Dict:
        """Build one complete ("X") trace event; timestamps are decimal strings (us)."""
        data = {
            Constant.OP_NAME: name,
            # BUG FIX: the fallback must be a color name (BUBBLE_COLOR, i.e.
            # "generic_work"), not the "Bubble" event label itself
            Constant.CNAME: self.COLORS.get(name, self.BUBBLE_COLOR),
            Constant.PH: Constant.PH_X,
            Constant.PID: pid,
            Constant.OP_TID: tid,
            Constant.TS: start_ts,
            Constant.DUR: str(Decimal(end_ts) - Decimal(start_ts))
        }

        return data

    def gen_stage_bubble_trace_data(self, rank_id: int, timeslice_list: List[PipelineTimeSlice]) -> List[Dict]:
        """
        generate stage bubble trace json data
        """
        rank_str = f'Rank {rank_id}'
        trace_data = []

        for timeslice in timeslice_list:
            data = self._gen_trace_pair(timeslice.slice_type, timeslice.start,
                                        timeslice.end, self.PIPLINE_VIEW, rank_str)
            trace_data.append(data)

        return trace_data

    def gen_fp_bp_trace_data(self, rank_id: int, timeslice_list: List[PipelineTimeSlice]) -> List[Dict]:
        """
        generate fp bp trace json data
        """
        rank_str = f'Rank {rank_id}'
        trace_data = []

        for timeslice in timeslice_list:
            if timeslice.slice_type == self.BUBBLE:
                data = self._gen_trace_pair(timeslice.slice_type, timeslice.start,
                                            timeslice.end, self.PIPLINE_VIEW, rank_str)
                trace_data.append(data)
            else:
                # split a stage into alternating FP / BP segments using the
                # (start, end) BP bounds recorded on the slice
                last_end = timeslice.start
                for bp_bound in timeslice.bp_timeslice:
                    data = self._gen_trace_pair(self.FP, last_end,
                                                bp_bound[0], self.PIPLINE_VIEW, rank_str)
                    trace_data.append(data)
                    last_end = bp_bound[1]

                    data = self._gen_trace_pair(self.BP, bp_bound[0],
                                                bp_bound[1], self.PIPLINE_VIEW, rank_str)
                    trace_data.append(data)

                # trailing FP segment up to the end of the stage
                last_data = self._gen_trace_pair(self.FP, last_end,
                                                 timeslice.end, self.PIPLINE_VIEW, rank_str)
                trace_data.append(last_data)

        return trace_data
+
+
class ClusterPipelineAdvice(ClusterAdviceBase):
    """
    Build a "Pipeline View" trace for a pipeline-parallel training job.

    For each requested rank this loads trace_view.json, derives pipeline
    stages and bubbles from hcom send/recv ops and, when framework-side
    FP/BP ops are present, further splits each stage into FP/BP segments.
    """
    BUBBLE = "Bubble"
    STAGE = "Stage"
    PIPELINE_VIEW = "Pipeline View"
    SAVE_JSON = "pipeline_view.json"

    def __init__(self, collection_path: str, kwargs: dict):
        # kwargs: "rank_ids" (list of ints; de-duplicated here, order not kept)
        # and optional "worker_num" (defaults to half the CPU count)
        super().__init__(collection_path)
        self.rank_ids = list(set(kwargs.get("rank_ids", [])))
        self.worker_num = kwargs.get("worker_num", int(multiprocessing.cpu_count() / 2))
        self.rank_prof_dirs = {}  # rank id -> profiling dir, filled by run()
        self.cur_data = []  # accumulated chrome-trace events
        self.cur_bottleneck = {}
        self.cur_advices = ""

    def run(self) -> dict:
        """
        Unified entrance interface
        """
        self.rank_prof_dirs = self.get_rank_prof_dirs(self.rank_ids)
        if not self.rank_prof_dirs:
            print("[ERROR] No rank profiling data found, please check the rank ids or dir path.")
            return {}

        self.process()
        self.output()
        self.identify_bottleneck()
        # output_format_data / DATA / BOTTLENECK / ADVICE are presumably
        # provided by the AdviceBase hierarchy — not visible in this file
        return self.output_format_data

    def process(self) -> None:
        """
        process all rank profiling data by using multi-process
        """
        start_time = time.time()
        print(f"[INFO] Start to process {len(self.rank_prof_dirs)} rank profiling data with {self.worker_num} workers.")
        with multiprocessing.Pool(self.worker_num) as pool:
            results = pool.map(self.work, self.rank_prof_dirs.items())

        # pool.map preserves input order, so results align with rank_prof_dirs
        for (rank_id, _), (res, show_fp_bp) in zip(self.rank_prof_dirs.items(), results):
            if show_fp_bp:
                self.cur_data += PipelineTraceViewer().gen_fp_bp_trace_data(rank_id, res)
            else:
                self.cur_data += PipelineTraceViewer().gen_stage_bubble_trace_data(rank_id, res)
        print(f"[INFO] Pipline view data process finished, cost {time.time() - start_time:.2f}s.")

    @staticmethod
    def _align_trace_bound(results: List) -> None:
        """
        align all rank trace bound for better visualization
        """
        # NOTE(review): not referenced inside this class in the visible code —
        # confirm it is invoked by a caller before relying on its effect
        start_list, end_list = [], []
        for res in results:
            start_list.append(res[0].start)
            end_list.append(res[-1].end)

        # update all rank trace bound
        for res in results:
            res[0].start = min(start_list)
            res[-1].end = max(end_list)

    def work(self, kv: Tuple[int, str]) -> Tuple[List[PipelineTimeSlice], bool]:
        """
        single process worker function
        """
        # Runs in a multiprocessing worker: returns the timeslice list for one
        # rank plus a flag saying whether FP/BP detail could be derived.
        show_fp_bp = False
        rank_id, rank_prof_dir = kv
        print(f"[INFO] [Rank {rank_id}] Start to process rank profiling data.")
        json_path = os.path.join(rank_prof_dir, Constant.ASCEND_PROFILER_OUTPUT, Constant.TRACE_VIEW_JSON)
        fine_data = self.load_trace_view_data(json_path)
        if not fine_data.hcom_ops or not fine_data.hcom_tids:
            print(f"[ERROR] [Rank {rank_id}] No hcom send recv ops found, make sure the trace view data is pipeline "
                  f"parallel sense.")
            return [], show_fp_bp

        timeslice_list = self.get_pipeline_timeslice(fine_data.hcom_ops, fine_data.hcom_tids, fine_data.min_ts,
                                                     fine_data.max_ts)
        if not fine_data.fp_ops or not fine_data.bp_ops:
            print(f"[INFO] [Rank {rank_id}] No frameWork data in trace view, only show stage and bubble.")
        elif len(fine_data.hcom_tids) > 1:
            print(f"[WARN] [Rank {rank_id}] More than one hcom tid found, only show stage and bubble.")
        else:
            print(f"[INFO] [Rank {rank_id}] Found frameWork data in trace view, show fp bp and bubble.")
            bp_ops = self.get_fp_bp_bound_ops(fine_data)
            self.update_stage_fp_bp(timeslice_list, bp_ops)
            show_fp_bp = True
        print(f"[INFO] [Rank {rank_id}] Rank profiling data process finished.")

        return timeslice_list, show_fp_bp

    def identify_bottleneck(self) -> None:
        """Bottleneck identification is not implemented for the pipeline view."""
        pass

    def output(self) -> None:
        """
        output result
        """
        # append the chrome-trace metadata event naming the "Pipeline View" process
        self.cur_data.append(
            {
                Constant.OP_NAME: Constant.PROCESS_NAME,
                Constant.PH: Constant.PH_META,
                Constant.PID: self.PIPELINE_VIEW,
                Constant.OP_TID: self.PIPELINE_VIEW,
                Constant.ARGS: {
                    Constant.OP_NAME: self.PIPELINE_VIEW
                }
            }
        )
        self.output_format_data[self.DATA] = self.cur_data
        self.output_format_data[self.BOTTLENECK] = self.cur_bottleneck
        self.output_format_data[self.ADVICE] = self.cur_advices

    def get_rank_prof_dirs(self, rank_ids: list) -> Dict[int, str]:
        """
        get rank profiling directories by rank ids
        """
        rank_prof_dirs = defaultdict(str)
        prof_dirs = []
        for prof_dir in os.listdir(self.collection_path):
            if prof_dir.endswith(Constant.PT_PROF_SUFFIX):
                prof_dirs.append(os.path.join(self.collection_path, prof_dir))

        data_map = PytorchDataPreprocessor(prof_dirs).get_data_map()
        for rank_id in rank_ids:
            if rank_id in data_map:
                rank_prof_dirs[rank_id] = data_map[rank_id]
            else:
                print(f'[Warning] Rank {rank_id} not found in {self.collection_path}')

        return rank_prof_dirs

    @staticmethod
    def load_trace_view_data(json_path) -> Optional[FineTraceViewData]:
        """
        load trace view data from json file and preprocess
        """
        raw_data = FileManager.read_json_file(json_path)
        return TraceViewPreProcessor().process(raw_data)

    @staticmethod
    def double_queue_pop(fp_que: Deque[dict], bp_que: Deque[dict]) -> Tuple[list, list]:
        """
        double queue (fp and bp que) pop alternating algorithm implementation
        """
        # Walks fp and bp event queues in timestamp order, emitting a
        # (start_op, end_op) bound pair every time control alternates between
        # the two queues.
        res_fp_ops, res_bp_ops = [], []
        # NOTE(review): TS values are compared as strings here while Decimal is
        # used elsewhere — correct only if timestamps have uniform width; confirm
        pop_fp = fp_que[0][Constant.TS] < bp_que[0][Constant.TS]
        fp_start_op, fp_end_op = fp_que[0], fp_que[0]
        bp_start_op, bp_end_op = bp_que[0], bp_que[0]

        def update_bound_op(que: Deque[dict], start_op: dict, end_op: dict) -> Tuple[dict, dict]:
            """
            update fp and bp bound op
            """
            op = que.popleft()
            op_s = Decimal(op[Constant.TS])
            op_e = op_s + Decimal(op[Constant.DUR])

            # widen the running [start_op, end_op] bound to cover this op
            start_op = op if op_s < Decimal(start_op[Constant.TS]) else start_op
            end_op = op if op_e > Decimal(end_op[Constant.TS]) + Decimal(end_op[Constant.DUR]) else end_op

            return start_op, end_op

        while fp_que and bp_que:
            if pop_fp:
                if len(fp_que) > 1 and bp_que and fp_que[1][Constant.TS] > bp_que[0][Constant.TS]:
                    pop_fp = False  # pop bp que
                if len(fp_que) == 1:
                    pop_fp = False  # pop bp que

                fp_start_op, fp_end_op = update_bound_op(fp_que, fp_start_op, fp_end_op)

                # time to pop bp que, need to record fp ops and update bp start op
                if not pop_fp:
                    res_fp_ops.append((fp_start_op, fp_end_op))
                    if fp_que:
                        bp_start_op, bp_end_op = bp_que[0], bp_que[0]
            else:
                if len(bp_que) > 1 and fp_que and bp_que[1][Constant.TS] > fp_que[0][Constant.TS]:
                    pop_fp = True  # pop fp que
                if len(bp_que) == 1:
                    pop_fp = True  # pop fp que

                bp_start_op, bp_end_op = update_bound_op(bp_que, bp_start_op, bp_end_op)

                # time to pop fp que, need to record bp ops and update fp start op
                if pop_fp:
                    res_bp_ops.append((bp_start_op, bp_end_op))
                    if bp_que:
                        fp_start_op, fp_end_op = fp_que[0], fp_que[0]

        # drain whichever queue still has events and record its final bound
        if fp_que:
            fp_start_op, fp_end_op = fp_que[0], fp_que[0]
            while fp_que:
                fp_start_op, fp_end_op = update_bound_op(fp_que, fp_start_op, fp_end_op)
            res_fp_ops.append((fp_start_op, fp_end_op))

        if bp_que:
            bp_start_op, bp_end_op = bp_que[0], bp_que[0]
            while bp_que:
                bp_start_op, bp_end_op = update_bound_op(bp_que, bp_start_op, bp_end_op)
            res_bp_ops.append((bp_start_op, bp_end_op))

        return res_fp_ops, res_bp_ops

    @staticmethod
    def update_ops_time(ops_list: List[List[dict]], torch_to_npu_links: List[dict],
                        npu_ops_ts_dur: dict) -> List[List[dict]]:
        """
        update fp and bp bound ops time at device by using torch_to_npu_links
        """
        ops_que = deque(ops_list)
        torch_to_npu_que = deque(torch_to_npu_links)
        res = []
        link_stack = []
        while ops_que and torch_to_npu_que:
            link = torch_to_npu_que.popleft()
            link_s = Decimal(link[Constant.TS])

            # bound op at framework level
            cpu_op_l, cpu_op_r = ops_que[0][0], ops_que[0][1]
            cpu_op_s = Decimal(cpu_op_l[Constant.TS])
            cpu_op_e = Decimal(cpu_op_r[Constant.TS]) + Decimal(cpu_op_r[Constant.DUR])

            # collect links whose start falls inside the framework op window
            if cpu_op_s < link_s < cpu_op_e:
                link_stack.append(link)
            if link_s > cpu_op_e or \
                    (link_stack and not torch_to_npu_que):
                min_link = link_stack[0]
                max_link = link_stack[-1]

                # NOTE(review): the link's ID field apparently carries the npu
                # timestamp (ns for int, us for float) — confirm upstream format
                min_link_s = str(min_link[Constant.ID])
                max_link_s = str(max_link[Constant.ID])
                # for compatibility with old data (ts is float type)
                if isinstance(min_link[Constant.ID], float):
                    cpu_op_l["npu_op_ts"] = min_link_s
                    cpu_op_r["npu_op_ts"] = max_link_s
                else:
                    # int ns -> "us.nnn" decimal string
                    cpu_op_l["npu_op_ts"] = f"{min_link_s[:-3]}.{min_link_s[-3:]}"
                    cpu_op_r["npu_op_ts"] = f"{max_link_s[:-3]}.{max_link_s[-3:]}"
                cpu_op_l["npu_op_dur"] = npu_ops_ts_dur.get(cpu_op_l["npu_op_ts"], 0)
                cpu_op_r["npu_op_dur"] = npu_ops_ts_dur.get(cpu_op_r["npu_op_ts"], 0)

                res.append([cpu_op_l, cpu_op_r])
                ops_que.popleft()
                link_stack.clear()

        return res

    def get_fp_bp_bound_ops(self, fine_data: FineTraceViewData) -> List[List[dict]]:
        """
        get fp and bp bound ops by using double queue alternating pop algorithm and
        update fp and bp bound ops time at device by using torch_to_npu_links
        """
        fp_que = deque(fine_data.fp_ops)
        bp_que = deque(fine_data.bp_ops)

        # get fp and bp bound ops
        _, res_bp_ops = self.double_queue_pop(fp_que, bp_que)

        # according to torch_to_npu_links, split fp and bp timeslice
        bp_ops = self.update_ops_time(res_bp_ops, fine_data.torch_to_npu_links, fine_data.npu_ops_ts_dur)
        return bp_ops

    def get_pipeline_timeslice(self, hcom_ops: list, hcom_tids: list,
                               min_ts: str, max_ts: str) -> List[PipelineTimeSlice]:
        """
        get pipeline timeslice by using hcom ops
        """
        timeslice_list = []
        last_op_end = None
        if len(hcom_tids) > 1:
            print("[WARN] More than one hcom tid found, default to show minimal tid pipeline view.")

        for op in hcom_ops:
            # only ops on the smallest hcom tid define the pipeline view
            if op[Constant.OP_TID] == min(hcom_tids):
                # gap between two hcom ops
                if last_op_end:
                    timeslice_list.append(PipelineTimeSlice(str(last_op_end), op[Constant.TS], self.STAGE))
                # hcom op
                last_op_end = Decimal(op[Constant.TS]) + Decimal(op[Constant.DUR])
                timeslice_list.append(PipelineTimeSlice(op[Constant.TS], str(last_op_end), self.BUBBLE))

        # add start STAGE and end STAGE
        timeslice_list.insert(0, PipelineTimeSlice(min_ts, timeslice_list[0].start, self.STAGE))
        timeslice_list.insert(len(timeslice_list), PipelineTimeSlice(timeslice_list[-1].end, max_ts, self.STAGE))
        return timeslice_list

    def update_stage_fp_bp(self, timeslice_list: List[PipelineTimeSlice],
                           bp_ops: List[List[dict]]) -> None:
        """
        update stage fp and bp time
        """
        # Merge-walk of stage slices and bp bounds, both in time order:
        # attach each bp bound to the stage slice that fully contains it.
        pipeline_que = deque(timeslice_list)
        bp_bound_que = deque(bp_ops)

        while pipeline_que and bp_bound_que:
            # skip non-stage (bubble) slices
            while pipeline_que[0].slice_type != self.STAGE:
                pipeline_que.popleft()
                if not pipeline_que:
                    return None

            bp_bound_data = bp_bound_que[0]
            bp_bound_s = Decimal(bp_bound_data[0]['npu_op_ts'])
            bp_bound_e = Decimal(bp_bound_data[1]['npu_op_ts']) + Decimal(bp_bound_data[1]['npu_op_dur'])

            pipeline_s = Decimal(pipeline_que[0].start)
            pipeline_e = Decimal(pipeline_que[0].end)

            if pipeline_s <= bp_bound_s and bp_bound_e <= pipeline_e:
                # bound fully inside the current stage: record it
                pipeline_que[0].bp_timeslice.append((str(bp_bound_s), str(bp_bound_e)))
                bp_bound_que.popleft()
            elif bp_bound_s > pipeline_e:
                # bound starts after this stage: advance to the next stage
                pipeline_que.popleft()
            else:
                # bound straddles a stage boundary: drop it
                bp_bound_que.popleft()
diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/kernel_cluster_advice.py b/profiler/advisor_review/advisor_backend/cluster_advice/kernel_cluster_advice.py
new file mode 100644
index 00000000000..6fa83c765f5
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/cluster_advice/kernel_cluster_advice.py
@@ -0,0 +1,62 @@
+import os
+import pandas as pd
+from common_func.path_manager import PathManager
+from common_func.constant import Constant
+from common_func_advisor.constant import Constant as AdvisorConstant
+from cluster_advice.cluster_advice_base import ClusterAdviceBase
+from cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor
+
+
class KernelClusterAdvice(ClusterAdviceBase):
    """Aggregates per-rank kernel_details.csv statistics across a cluster."""
    COLUMNS_TO_GROUP = ["Name", "Input Shapes", "Input Data Types", "Output Shapes"]
    COLUMNS_TO_CAL = ["Duration(us)"]
    CAL_FUN = ['mean', 'var', 'max', 'min', 'count', 'sum']

    def __init__(self, collection_path: str, kwargs: dict = None):
        super().__init__(collection_path)
        self.all_kernel_data = pd.DataFrame()

    def run(self):
        """Load every rank's kernel details and return the aggregated view."""
        self.load_kernel_details_data()
        return self.calculate_data()

    def load_kernel_details_data(self):
        """Read kernel_details.csv for each rank into one DataFrame with a
        'rank id' column; raises RuntimeError on missing profiles/columns."""
        prof_dirs = self.get_prof_dirs(self.collection_path)
        if not prof_dirs:
            msg = "[ERROR] There is no profile in this collection path, terminate analysis."
            raise RuntimeError(msg)

        data_map = PytorchDataPreprocessor(prof_dirs).get_data_map()
        self.all_kernel_data = pd.DataFrame()
        for rank_id, profiling_dir_path in data_map.items():
            kernel_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.KERNEL_DETAILS_CSV)
            # NOTE(review): this guard is always true (os.path.join returns a
            # non-empty string); existence/readability is actually enforced by
            # check_path_readable below — confirm intended behavior
            if kernel_file:
                # validate the csv file before reading
                PathManager.check_path_readable(kernel_file)
                df_temp = pd.read_csv(kernel_file)
                columns_to_keep = self.COLUMNS_TO_GROUP + self.COLUMNS_TO_CAL
                if any(col not in df_temp.columns for col in columns_to_keep):
                    msg = "[ERROR] Kernel details.csv has wrong data columns, terminate analysis."
                    raise RuntimeError(msg)
                df = df_temp[columns_to_keep]
                df.insert(loc=0, column='rank id', value=rank_id)
                # append this rank's rows to the combined frame
                self.all_kernel_data = pd.concat([self.all_kernel_data, df], ignore_index=True)

    def calculate_data(self):
        """Group by rank + op signature and compute the CAL_FUN statistics."""
        calculate_dict = {col: self.CAL_FUN for col in self.COLUMNS_TO_CAL}
        group_col = ["rank id"] + self.COLUMNS_TO_GROUP
        view_data = self.all_kernel_data.groupby(group_col).agg(calculate_dict).reset_index()
        # flatten the MultiIndex columns produced by agg ("Duration(us)_mean", ...)
        view_data.columns = [''.join(col) if col[1] == "" else '_'.join(col) for col in view_data.columns]
        return view_data

    def get_prof_dirs(self, collection_path):
        """Return *_ascend_pt profiling directories directly under collection_path."""
        return [os.path.join(collection_path, prof_dir)
                for prof_dir in os.listdir(collection_path)
                if prof_dir.endswith(AdvisorConstant.PT_PROF_SUFFIX)]
\ No newline at end of file
diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/slow_link_advice.py b/profiler/advisor_review/advisor_backend/cluster_advice/slow_link_advice.py
new file mode 100644
index 00000000000..f8a625242f3
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/cluster_advice/slow_link_advice.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from collections import defaultdict
+from common_func_advisor.constant import Constant
+from common_func.file_manager import FileManager
+from cluster_advice.cluster_advice_base import ClusterAdviceBase
+
+
class SlowLinkAdvice(ClusterAdviceBase):
    """Detects slow RDMA/SDMA links by comparing per-rank communication bandwidth."""
    RDMA_TIME_MS = "RDMA time(ms)"
    RDMA_SIZE_MB = "RDMA size(mb)"
    SDMA_TIME_MS = "SDMA time(ms)"
    SDMA_SIZE_MB = "SDMA size(mb)"
    RDMA_BANDWIDTH = "RDMA bandwidth(GB/s)"
    SDMA_BANDWIDTH = "SDMA bandwidth(GB/s)"
    COMMUNICATION_BANDWIDTH_INFO = "Communication Bandwidth Info"
    TRANSIT_TIME = "Transit Time(ms)"
    TRANSIT_SIZE = "Transit Size(MB)"
    SDMA = "SDMA"
    RDMA = "RDMA"

    def __init__(self, collection_path: str, kwargs: dict = None):
        super().__init__(collection_path)
        # per-rank accumulators (keyed by int rank id)
        self.rank_bw_dict = defaultdict(lambda: {
            self.RDMA_TIME_MS: 0,
            self.RDMA_SIZE_MB: 0,
            self.SDMA_TIME_MS: 0,
            self.SDMA_SIZE_MB: 0,
        })

    @staticmethod
    def compute_ratio(dividend: float, divisor: float):
        """Return dividend / divisor rounded to 4 digits; 0 for (near-)zero divisor."""
        if abs(divisor) < 1e-15:
            return 0
        return round(dividend / divisor, 4)

    def load_communication_json(self):
        """Load cluster_communication.json produced by the cluster analysis step."""
        json_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT, Constant.CLUSTER_COMM_JSON)
        if not os.path.exists(json_path):
            msg = "[ERROR] cluster_communication.json doesn't exist, terminate analysis."
            raise RuntimeError(msg)
        return FileManager.read_json_file(json_path)

    def run(self):
        """Unified entrance: validate path, accumulate bandwidth, report bottlenecks."""
        self.path_check()
        communication_json = self.load_communication_json()
        self.process(communication_json)
        self.output()
        return self.output_format_data

    def process(self, communication_json: dict):
        """Walk every group/step/op and accumulate per-rank bandwidth, then report."""
        for comm_group, group_dict in communication_json.items():
            for step, step_dict in group_dict.items():
                for op, op_dict in step_dict.items():
                    self.compute_bandwidth(op_dict)
        if self.rank_bw_dict:
            self.produce_bottleneck(self.RDMA_BANDWIDTH)
            self.produce_bottleneck(self.SDMA_BANDWIDTH)

    def compute_bandwidth(self, op_dict: dict):
        """Accumulate one op's transit time/size per rank and refresh the ratios.

        Raises ValueError when a rank key is not an integer string.
        """
        for rank_id, rank_dict in op_dict.items():
            try:
                rank = int(rank_id)
            except ValueError as e:
                msg = "[ERROR] Cluster_communication.json has invalid structure."
                raise ValueError(msg) from e
            for comm_type, bw_dict in rank_dict.get(self.COMMUNICATION_BANDWIDTH_INFO, {}).items():
                # BUG FIX: default the lookups to 0 so a missing "Transit ..."
                # key does not make `+=` raise TypeError on None
                if comm_type == self.SDMA:
                    self.rank_bw_dict[rank][self.SDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE, 0)
                    self.rank_bw_dict[rank][self.SDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME, 0)
                if comm_type == self.RDMA:
                    self.rank_bw_dict[rank][self.RDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE, 0)
                    self.rank_bw_dict[rank][self.RDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME, 0)

        # MB / ms == GB/s, so the ratio is already in the advertised unit
        for rank, bw in self.rank_bw_dict.items():
            bw[self.RDMA_BANDWIDTH] = self.compute_ratio(bw[self.RDMA_SIZE_MB], bw[self.RDMA_TIME_MS])
            bw[self.SDMA_BANDWIDTH] = self.compute_ratio(bw[self.SDMA_SIZE_MB], bw[self.SDMA_TIME_MS])

    def produce_bottleneck(self, link_type: str):
        """Append a bottleneck description for link_type unless all bandwidth is 0."""
        data_list = [rank_dict.get(link_type, 0) for rank_id, rank_dict in self.rank_bw_dict.items()]
        avg_bw = round(sum(data_list) / len(data_list), 3)
        if avg_bw == 0:
            return
        self.bottelneck += f'{link_type}: \n' \
                           f'The average is {avg_bw}, ' \
                           f'while the maximum is {round(max(data_list), 3)}GB/s and ' \
                           f'the minimum is {round(min(data_list), 3)}GB/s. ' \
                           f'the difference is {round(max(data_list) - min(data_list), 3)}GB/s. \n'

    def output(self):
        """Publish per-rank bandwidth data and the bottleneck text."""
        self.output_format_data[self.DATA] = self.rank_bw_dict
        self.output_format_data[self.BOTTLENECK] = self.bottelneck
diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/slow_rank_advice.py b/profiler/advisor_review/advisor_backend/cluster_advice/slow_rank_advice.py
new file mode 100644
index 00000000000..4e789fb7fb6
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/cluster_advice/slow_rank_advice.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from collections import defaultdict
+from common_func_advisor.constant import Constant
+from common_func.file_manager import FileManager
+from cluster_advice.cluster_advice_base import ClusterAdviceBase
+from prof_bean_advisor.cluster_step_trace_time_bean import ClusterStepTraceTimeBean
+
+
class SlowRankAdvice(ClusterAdviceBase):
    """Detects slow ranks by comparing compute/communication/free time across ranks."""
    RANK = "rank"
    RATIO_THRESHOLD = 0.05
    BOTTLENECK_LIST = ['Computing', 'Communication', "Free"]

    def __init__(self, collection_path: str, kwargs: dict = None):
        super().__init__(collection_path)

    def load_step_time(self):
        """Load cluster_step_trace_time.csv as ClusterStepTraceTimeBean rows."""
        csv_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT, Constant.CLUSTER_STEP_TIME_CSV)
        if not os.path.exists(csv_path):
            msg = "[ERROR] cluster_step_trace_time.csv doesn't exist, terminate analysis."
            raise RuntimeError(msg)
        return FileManager.read_csv_file(csv_path, ClusterStepTraceTimeBean)

    def run(self):
        """Unified entrance: validate path, aggregate step times, report bottlenecks."""
        self.path_check()
        step_data = self.load_step_time()
        step_dict = self.process(step_data)
        self.output(step_dict)
        return self.output_format_data

    def process(self, step_data: list):
        """Sum compute/communication/free per rank, then flag imbalanced dimensions."""
        # slots: [compute, communication, free, unused]
        step_dict = defaultdict(lambda: [0, 0, 0, 0])
        for step_bean in step_data:
            if step_bean.type == self.RANK:
                step_dict[step_bean.index][0] += step_bean.compute
                step_dict[step_bean.index][1] += step_bean.communication
                step_dict[step_bean.index][2] += step_bean.free
        total_time_list = [sum(data_tuple) for data_tuple in step_dict.values()]
        if total_time_list:
            mean_total_time = sum(total_time_list) / len(total_time_list)
            for produce_type, _ in enumerate(self.BOTTLENECK_LIST):
                self.produce_bottleneck(step_dict, produce_type, mean_total_time)
        return step_dict

    def produce_bottleneck(self, step_dict: dict, produce_type: int, mean_total_time: float):
        """Record a bottleneck when the max-min gap exceeds RATIO_THRESHOLD of the mean."""
        data_list = [data_tuple[produce_type] for data_tuple in step_dict.values()]
        max_ratio = self.compute_max_gap_ratio(data_list, mean_total_time)
        if max_ratio > self.RATIO_THRESHOLD:
            # max_ratio * mean == (max - min); /1000 converts us to ms
            self.bottelneck += f'{self.BOTTLENECK_LIST[produce_type]} has some issues in the cluster, ' \
                               f'because the max difference of {self.BOTTLENECK_LIST[produce_type]} time ' \
                               f'has reached {round(max_ratio * mean_total_time / 1000, 3)}ms. \n'

    def output(self, step_dict: dict):
        """Publish per-rank step time data and the bottleneck text."""
        self.output_format_data[self.DATA] = step_dict
        self.output_format_data[self.BOTTLENECK] = self.bottelneck
diff --git a/profiler/advisor_review/advisor_backend/common_func_advisor/__init__.py b/profiler/advisor_review/advisor_backend/common_func_advisor/__init__.py
new file mode 100644
index 00000000000..8400fd5ecd1
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/common_func_advisor/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/advisor_review/advisor_backend/common_func_advisor/constant.py b/profiler/advisor_review/advisor_backend/common_func_advisor/constant.py
new file mode 100644
index 00000000000..46a7fb24c2d
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/common_func_advisor/constant.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from enum import Enum
+
+
class CsvTitle:
    """Column titles shared by both kernel_details.csv format versions.

    Version-specific titles (op name/type, task type/duration) live in
    CsvTitleV1 (legacy) and CsvTitleV2 (current).
    """
    # Task identification columns.
    MODEL_NAME = "Model Name"
    MODEL_ID = "Model ID"
    TASK_ID = "Task ID"
    STREAM_ID = "Stream ID"
    INFER_ID = "Infer ID"
    TASK_START_TIME = "Task Start Time(us)"
    TASK_WAIT_TIME = "Task Wait Time(us)"
    BLOCK_DIM = "Block Dim"
    MIX_BLOCK_DIM = "Mix Block Dim"
    HF32_ELIGIBLE = "HF32 Eligible"
    # Tensor I/O description columns.
    INPUT_SHAPES = "Input Shapes"
    INPUT_DATA_TYPES = "Input Data Types"
    INPUT_FORMATS = "Input Formats"
    OUTPUT_SHAPES = "Output Shapes"
    OUTPUT_DATA_TYPES = "Output Data Types"
    OUTPUT_FORMATS = "Output Formats"
    CONTEXT_ID = "Context ID"
    # AI Core (cube) pipeline metrics.
    AICORE_TIME = "aicore_time(us)"
    AIC_TOTAL_CYCLES = "aic_total_cycles"
    AIC_MAC_TIME = "aic_mac_time(us)"
    AIC_MAC_RATIO = "aic_mac_ratio"
    AIC_SCALAR_TIME = "aic_scalar_time(us)"
    AIC_SCALAR_RATIO = "aic_scalar_ratio"
    AIC_MTE1_TIME = "aic_mte1_time(us)"
    AIC_MTE1_RATIO = "aic_mte1_ratio"
    AIC_MTE2_TIME = "aic_mte2_time(us)"
    AIC_MTE2_RATIO = "aic_mte2_ratio"
    AIC_FIXPIPE_TIME = "aic_fixpipe_time(us)"
    AIC_FIXPIPE_RATIO = "aic_fixpipe_ratio"
    AIC_ICACHE_MISS_RATE = "aic_icache_miss_rate"
    # AI Vector core pipeline metrics.
    AIV_TIME = "aiv_time(us)"
    AIV_TOTAL_CYCLES = "aiv_total_cycles"
    AIV_VEC_TIME = "aiv_vec_time(us)"
    AIV_VEC_RATIO = "aiv_vec_ratio"
    AIV_SCALAR_TIME = "aiv_scalar_time(us)"
    AIV_SCALAR_RATIO = "aiv_scalar_ratio"
    AIV_MTE2_TIME = "aiv_mte2_time(us)"
    AIV_MTE2_RATIO = "aiv_mte2_ratio"
    AIV_MTE3_TIME = "aiv_mte3_time(us)"
    AIV_MTE3_RATIO = "aiv_mte3_ratio"
    AIV_ICACHE_MISS_RATE = "aiv_icache_miss_rate"
    CUBE_UTILIZATION = "cube_utilization( %)"
    # Derived/aggregated statistics columns.
    TASK_DURATION_SUM = "Task Duration Sum(us)"
    TASK_DURATION_MEAN = "Task Duration Mean(us)"
    TASK_DURATION_STD = "Task Duration Std(us)"
    TASK_DURATION_RATIO = "Task Duration Ratio(100%)"
    SIZE = "size(MB)"
    THROUGHPUT = "throughput(GB/s)"
    COLOR = "color"
    GAP = "Gap(us)"
    DURATION_SUM = "Duration Sum(us)"
    COUNT = "Count"
    MAX_DURATION = "Max Duration(us)"
    MIN_DURATION = "Min Duration(us)"
    AVG_DURATION = "Avg Duration(us)"
    DURATION_RATIO = "Duration Ratio"
    INDEX = "Index"
+
+
# Legacy column titles (inherits CsvTitle); adapts to the old csv format.
class CsvTitleV1(CsvTitle):
    """Version-specific titles used by the legacy kernel_details.csv."""
    OP_NAME = "Op Name"
    OP_TYPE = "OP Type"
    TASK_TYPE = "Task Type"
    TASK_DURATION = "Task Duration(us)"
+
+
# Current column titles (inherits CsvTitle); adapts to the new csv format.
# (The original comment said "V1" here; this class is the V2 variant.)
class CsvTitleV2(CsvTitle):
    """Version-specific titles used by the current kernel_details.csv."""
    OP_NAME = "Name"
    OP_TYPE = "Type"
    TASK_TYPE = "Accelerator Core"
    TASK_DURATION = "Duration(us)"
+
+
class Constant:
    """Shared constants for the advisor backend: dtype sizes, mode/advice
    names, profiling file names, trace-event keys, and fusable-op patterns."""
    # Size in bytes of each tensor element type.
    DTYPE_SIZE_MAP = {"int8": 1, "uint8": 1,
                      "int16": 2, "uint16": 2,
                      "int32": 4, "uint32": 4,
                      "int64": 8, "uint64": 8,
                      "float16": 2,
                      "bfloat16": 2,
                      "bf16": 2,
                      "dt_bf16": 2,
                      "float32": 4,
                      "float": 4,
                      "float64": 8,
                      "complex64": 8,
                      "complex128": 16,
                      "bool": 1}
    # Tuning thresholds and unit conversion factors.
    TP_THRESHOLD = 1150
    MAX_INPUT_MODE_LEN = 30
    MAX_INPUT_ADVICE_LEN = 30
    SMALL_OP_DUR_RATIO = 0.2
    SMALL_OP_NUM_RATIO = 0.2
    BYTE_UNIT_TRANS = 1024
    UNIT_TRANS = 1000

    # mode list
    COMPUTE = "compute"
    TIMELINE = "timeline"
    CLUSTER = "cluster"
    OVERALL = "overall"
    PIPELINE = "pipeline"

    # advice list
    SLOW_RANK = "slow rank"
    SLOW_LINK = "slow link"
    KERNEL = "kernel"

    # compute
    NPU_FUSED = "npu_fused"
    NPU_SLOW = "npu_slow"

    # timeline
    OPTIM = "optimizer"
    OP_SCHE = "op_schedule"

    # overall
    SUMMARY = "summary"

    # Profiling collection layout (directory / file names).
    PT_PROF_SUFFIX = "ascend_pt"
    ASCEND_PROFILER_OUTPUT = "ASCEND_PROFILER_OUTPUT"
    COLLECTION_PATH = "collection_path"
    CLUSTER_ANALYSIS_OUTPUT = "cluster_analysis_output"
    KERNEL_DETAILS_CSV = "kernel_details.csv"
    CLUSTER_STEP_TIME_CSV = "cluster_step_trace_time.csv"
    CLUSTER_COMM_JSON = "cluster_communication.json"

    # pipeline: trace-event field names and markers (trace_view.json keys)
    OP_NAME = "name"
    OP_TID = "tid"
    PID = "pid"
    TS = "ts"
    DUR = "dur"
    CAT = "cat"
    ARGS = "args"
    PH = "ph"
    ID = "id"
    PH_START = "s"
    PH_BEGIN = "B"
    PH_END = "E"
    PH_META = "M"
    PH_X = "X"
    CNAME = "cname"
    PROCESS_NAME = "process_name"
    FRAMEWORK_NAME = "Python"
    ASCEND_HARDWARE_NAME = "Ascend Hardware"
    ASYNC_NPU = "async_npu"
    STEP_PREFIX = "ProfilerStep#"
    FP_ATEN_OP = "aten"
    FP_C10D_OP = "c10d"
    HCOM_OP_PREFIX = "hcom_"
    BP_AUTOGRAD_OP = "autograd"
    TRACE_VIEW_JSON = "trace_view.json"

    # pattern_dict key: pattern, value: pattern name
    PATTERN_DICT = {("Add", "DropOutDoMask", "Add"): "bias_dropout_add",
                    ("BatchMatMul", "Mul", "Cast", "Mul", "MaskedFill", "SoftmaxV2", "Cast", "DropOutDoMask",
                     "AsStrided", "BatchMatMul", "Transpose"): "FA",
                    ("Transpose", "Transpose", "Transpose", "Mul", "Transpose", "BatchMatMulV2", "MaskedFill",
                     "Cast", "SoftmaxV2", "Cast", "DropOutDoMask", "BatchMatMulV2", "Transpose"): "FA",
                    ("Transpose", "BatchMatMulV2", "Transpose", "Transpose", "BatchMatMulV2", "ZerosLike",
                     "DropOutDoMask", "Cast", "SoftmaxGrad", "Cast", "MaskedFill", "BatchMatMulV2",
                     "BatchMatMulV2", "Mul"): "FA",
                    ("Cast", "Square", "ReduceMeanD", "Add", "Rsqrt", "Cast", "Cast", "Mul", "Cast", "Cast",
                     "Mul", "Cast"): "RMSNORM",
                    ("Cast", "LayerNorm", "Cast"): "LayerNorm",
                    ("Add", "LayerNorm"): "AddLayerNorm",
                    ("Add", "LayerNormV3"): "AddLayerNorm",
                    ("Gelu", "Add"): "GeluAdd",
                    ("Cast", "Square", "MemSet", "ReduceMean", "Add", "Rsqrt", "Mul", "Cast", "Mul"): "RMSNorm",
                    ("BatchMatMul", "RealDiv", "Add", "Maximum", "SoftmaxV2", "Cast", "BatchMatMul"): "FA",
                    ("BatchMatMulV2", "RealDiv", "Add", "Cast", "Maximum", "Cast", "SoftmaxV2", "AsStrided",
                     "BatchMatMulV2"): "FA",
                    ("BatchMatMulV2", "RealDiv", "Add", "Cast", "SoftmaxV2", "Cast", "BroadcastTo",
                     "BatchMatMulV2"): "FA",
                    ("Mul", "Slice", "Neg", "Slice", "ConcatD", "Cast", "Mul", "Add"): "RotaryMul",
                    ("Mul", "AsStrided", "Neg", "AsStrided", "ConcatD", "Mul", "Add"): "RotaryMul",
                    ("Mul", "Slice", "Neg", "Slice", "ConcatD", "Mul", "Add"): "RotaryMul",
                    ("MatMulV2", "Swish", "MatMulV2", "Mul", "MatMulV2"): "FFN",
                    ("Transpose", "Transpose", "GatherElement", "Transpose"): "GatherElement",
                    ("Slice", "Slice", "Swish", "Mul"): "torch_npu.npu_swiglu",
                    ("Cast", "Mul", "MaskedFill", "SoftmaxV2", "Cast"): "torch_npu.npu_scaled_masked_softmax",
                    ("Mul", "Slice", "Neg", "Slice", "ConcatD", "Mul"): "torch_npu.npu_rotary_mul",
                    ("Cast", "Square", "ReduceMeanD", "Add", "Rsqrt", "Mul", "Cast", "Mul"): "torch_npu.npu_rms_norm"}
    # Active csv title set; defaults to the new format.
    TITLE = CsvTitleV2

    @classmethod
    def update_title(cls):
        # Switch to the legacy column titles when an old-format csv is detected.
        cls.TITLE = CsvTitleV1
+
+
class CoreType:
    """Accelerator-core type names as they appear in the csv task-type column."""
    AIV = "AI_VECTOR_CORE"
    AIC = "AI_CORE"
    AICPU = "AI_CPU"
    MIX_AIV = "MIX_AIV"
    MIX_AIC = "MIX_AIC"
    HCCL = "HCCL"
+
+
class PerfColor(Enum):
    """Severity color grading for op performance (WHITE = no rating)."""
    WHITE = 0
    GREEN = 1
    YELLOW = 2
    RED = 3
diff --git a/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_json.py b/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_json.py
new file mode 100644
index 00000000000..8171f06ee23
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_json.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from abc import abstractmethod
+from dataclasses import dataclass
+from dataclasses import field
+from typing import Dict
+from typing import List
+
+import pandas as pd
+
+from common_func.file_manager import FileManager
+
+
@dataclass
class TraceObj:
    """Base record for one Chrome-trace event parsed from trace_view.json.

    Subclasses implement `hash()` to produce the key under which the event
    is indexed (process name, timestamp, ...).
    """
    ph: str = ""
    bp: str = ""
    cat: str = ""
    name: str = ""
    pid: int = 0
    tid: int = 0
    id: int = 0
    ts: str = ""
    dur: float = 0.0
    # Fix: the original used field(default='unknown'), which made `args` a str
    # whenever the event carried no "args" entry; every consumer immediately
    # calls self.args.get(...), so the default must be a dict. default_factory
    # also avoids the shared-mutable-default pitfall.
    args: dict = field(default_factory=dict)

    @abstractmethod
    def hash(self):
        raise Exception("To be implemented")

    def valid(self):
        # An event without a name cannot be keyed.
        return self.name != ""

    def check_hashable(self):
        if not self.valid():
            # Fix: the original formatted self.__class__.name, which resolves
            # to the dataclass field default "" instead of the class name.
            raise Exception("Illegal {} to hash".format(type(self).__name__))
+
+
@dataclass
class Process(TraceObj):
    """Metadata event describing a trace process ("process_name")."""
    def hash(self):
        self.check_hashable()
        # msprof guarantees process names are unique, so args["name"] is a key.
        return self.args.get("name")
+
+
@dataclass
class Thread(TraceObj):
    """Metadata event describing a trace thread ("thread_name")."""
    def hash(self):
        self.check_hashable()
        # msprof guarantees thread names are unique, so args["name"] is a key.
        return self.args.get("name")
+
+
@dataclass
class DurationEvent(TraceObj):
    """Complete ("X") event; indexed by its start timestamp."""
    def hash(self):
        self.check_hashable()
        return self.ts
+
+
@dataclass
class FlowEvent(TraceObj):
    """Flow (arrow) event linking a framework-side op to its npu-side op.

    Indexed by the end-point timestamp so npu events can resolve back to
    their framework origin.
    """
    s_point_ts: str = ""  # timestamp of the flow start point ('s' phase)
    e_point_ts: str = ""  # timestamp of the flow end point

    def hash(self):
        self.check_hashable()
        return self.e_point_ts
+
+
class TraceViewJson:
    """Indexed view over a trace_view.json file.

    Builds lookup tables for processes, threads, duration events (bucketed by
    owning process) and torch->npu flow events, then answers python call-stack
    queries for a given npu-side timestamp.
    """

    def __init__(self, path):
        self.processes: Dict[str, Process] = dict()
        self.threads: Dict[str, Thread] = dict()
        self.python_dur_events: Dict[str, DurationEvent] = dict()
        self.cann_dur_events: Dict[str, DurationEvent] = dict()
        self.ascend_hardware_dur_events: Dict[str, DurationEvent] = dict()
        self.torch_2_npu_flow_events: Dict[str, FlowEvent] = dict()
        traces = FileManager.read_json_file(path)
        self._load_obj(traces)

    def get_call_stack(self, data: pd.DataFrame, index_id: int, ts_col: str) -> str:
        """Resolve the python call stack for row `index_id` of `data`.

        The row's `ts_col` timestamp is mapped through the torch->npu flow
        event to the framework side, then all python duration events covering
        that moment are collected. Returns "" when any step fails.
        """
        if ts_col not in data.columns.tolist():
            print("[ERROR] No {} col found in data columns.".format(ts_col))
            return ""
        row = data.loc[index_id]
        timestamp = row[ts_col]
        flow_event = self.get_torch_2_npu_flow_event(timestamp)
        if not flow_event.valid():
            print("[ERROR] Get flow event failed for pattern {}.".format(row['pattern']))
            return ""
        flow_event_s_key = flow_event.s_point_ts
        python_dur_events = self.get_python_dur_events_contain_ts(flow_event_s_key)
        if not python_dur_events:
            print("[ERROR] No python dur event found for pattern {}.".format(row['pattern']))
            return ""
        # Keep compatibility between old and new callstack layouts.
        if python_dur_events[0].args.get("Call stack"):
            # Old layout: the whole stack is one ';'-separated args entry.
            call_stack_list = python_dur_events[0].args.get("Call stack").split(";")
        else:
            python_dur_events.sort(key=lambda e: e.ts)
            # New layout: the stack is the chain of python_function events.
            call_stack_list = [event.name for event in python_dur_events if event.cat == "python_function"]
        call_stack = "\n".join(call_stack_list)
        return call_stack

    def get_torch_2_npu_flow_event(self, end_time) -> FlowEvent:
        """Return the flow event whose npu-side (end) timestamp is `end_time`,
        or an empty FlowEvent when no such link exists."""
        if not self.torch_2_npu_flow_events or not self.torch_2_npu_flow_events.get(end_time):
            print("[ERROR] Find flow event failed for ts: {}".format(end_time))
            return FlowEvent()
        return self.torch_2_npu_flow_events.get(end_time)

    def get_python_dur_events_contain_ts(self, ts) -> List[DurationEvent]:
        """Return every python duration event whose [ts, ts + dur] span
        contains the given timestamp."""
        res = []
        for event in self.python_dur_events.values():
            if float(event.ts) <= float(ts) <= float(event.ts) + event.dur:
                res.append(event)
        return res

    def _load_obj(self, traces):
        # Metadata headers must be loaded first; events are only indexed once
        # the required processes are confirmed present.
        self._load_format(traces)
        if not self._check_format():
            print("[ERROR] parse json failed for error format")
            return
        self._load_duration_events(traces)
        self._load_torch_to_npu_flow_events(traces)

    def _check_format(self):
        # Only these two processes are needed by current features; extendable.
        check_processes = ['Python', 'Ascend Hardware']
        for check_process in check_processes:
            if check_process in self.processes:
                continue
            print("[ERROR] {} process not found in json.".format(check_process))
            return False
        return True

    # Load pid/tid metadata headers.
    def _load_format(self, traces: List[Dict]):
        for i, trace in enumerate(traces):
            if trace.get('name') == 'process_name':
                # Skip malformed headers missing args.name or pid.
                if not trace.get('args') or not trace.get('args').get('name') or not trace.get('pid'):
                    continue
                process = Process(**trace)
                self.processes[process.hash()] = process
            if trace.get('name') == 'thread_name':
                if not trace.get('args') or not trace.get('args').get('name') or not trace.get('tid'):
                    continue
                thread = Thread(**trace)
                self.threads[thread.hash()] = thread

    def _load_duration_events(self, traces: List[Dict]):
        def check_events(_trace):
            return _trace.get('name') and _trace.get("ts") and _trace.get("dur")

        # NOTE(review): _check_format only validates 'Python' and
        # 'Ascend Hardware'; if no 'CANN' process header exists,
        # self.processes.get("CANN") is None and .pid raises AttributeError
        # — confirm CANN is always present in these traces.
        python_pid = self.processes.get("Python").pid
        cann_pid = self.processes.get("CANN").pid
        ascend_hardware_pid = self.processes.get("Ascend Hardware").pid
        for i, trace in enumerate(traces):
            if trace.get('ph') != 'X':
                continue
            if not check_events(trace):
                continue
            event = DurationEvent(**trace)
            # Bucket complete events by their owning process.
            if trace.get('pid') == python_pid:
                self.python_dur_events[event.hash()] = event
            elif trace.get('pid') == cann_pid:
                self.cann_dur_events[event.hash()] = event
            elif trace.get("pid") == ascend_hardware_pid:
                self.ascend_hardware_dur_events[event.hash()] = event

    def _load_torch_to_npu_flow_events(self, traces: List[Dict]):
        def check_events(_trace):
            return _trace.get('name') and _trace.get("id") and _trace.get("ts")

        # Flow start ('s') and end ('f') halves share an id; pair them here.
        flow_events_table_by_id = dict()

        # NOTE(review): python_pid is assigned the Process *object*, not its
        # .pid, so the pid comparison below can never match a numeric trace
        # pid; combined with the 'and'-chained negative conditions this filter
        # likely admits more events than intended — verify against msprof data.
        python_pid = self.processes.get("Python")
        for i, trace in enumerate(traces):
            if trace.get('ph') != 's' and trace.get('ph') != 'f' and trace.get('pid') != python_pid:
                continue
            if not check_events(trace):
                continue
            event = flow_events_table_by_id.get(trace.get("id"))
            if not event:
                event = FlowEvent(**trace)
            if trace.get('ph') == 's':
                event.s_point_ts = trace.get('ts')
            else:
                event.e_point_ts = trace.get('ts')
            flow_events_table_by_id[event.id] = event

        # Re-key by end-point timestamp for npu-side lookups.
        self.torch_2_npu_flow_events = {eve.hash(): eve for eve in flow_events_table_by_id.values()}
diff --git a/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_preprocessor.py b/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_preprocessor.py
new file mode 100644
index 00000000000..7b9baa32d94
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_preprocessor.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+import sys
+from typing import Optional
+from dataclasses import dataclass
+
+from common_func_advisor.constant import Constant
+
+
@dataclass
class FineTraceViewData:
    """Pre-processed trace_view data produced by TraceViewPreProcessor:
    the pids/tids of interest plus chronologically sortable op lists."""
    py_pid: int = -1      # pid of the python (framework) process
    fp_tid: int = -1      # tid carrying forward (aten/c10d) ops
    bp_tid: int = -1      # tid carrying backward (autograd) ops
    ascend_pid: int = -1  # pid of the "Ascend Hardware" process
    min_ts: str = str(sys.maxsize)  # min hcom-op timestamp seen (string-compared)
    max_ts: str = "0"               # max hcom-op timestamp seen
    # None defaults are normalized to fresh containers in __post_init__,
    # avoiding the shared-mutable-default pitfall.
    hcom_tids: Optional[list] = None
    fp_ops: Optional[list] = None
    bp_ops: Optional[list] = None
    hcom_ops: Optional[list] = None
    npu_ops_ts_dur: Optional[dict] = None
    torch_to_npu_links: Optional[list] = None

    def __post_init__(self):
        self.hcom_tids = self.hcom_tids or []
        self.fp_ops = self.fp_ops or []
        self.bp_ops = self.bp_ops or []
        self.hcom_ops = self.hcom_ops or []
        self.npu_ops_ts_dur = self.npu_ops_ts_dur or {}
        self.torch_to_npu_links = self.torch_to_npu_links or []

    def sort(self):
        # Order all op lists chronologically by their trace timestamp.
        self.fp_ops.sort(key=lambda x: x[Constant.TS])
        self.bp_ops.sort(key=lambda x: x[Constant.TS])
        self.hcom_ops.sort(key=lambda x: x[Constant.TS])
        self.torch_to_npu_links.sort(key=lambda x: x[Constant.TS])
+
+
class TraceViewPreProcessor:
    """
    Trace view data preprocess: classify raw trace events into fp/bp/hcom/npu
    buckets and produce a FineTraceViewData for downstream pipeline analysis.
    """

    @staticmethod
    def _is_fp_op(op_name: str) -> bool:
        """
        check whether op is fp op (framework aten/c10d prefix)
        """
        return op_name.startswith(Constant.FP_ATEN_OP) or op_name.startswith(Constant.FP_C10D_OP)

    @staticmethod
    def _is_fp_data(data: dict, fp_tid: int, py_pid: int) -> bool:
        """
        check whether data is valid fp data
        """
        # fp events live on the fp thread of the python process, carry ts/dur,
        # and exclude the ProfilerStep# wrapper events.
        return data[Constant.OP_TID] == fp_tid and \
               Constant.TS in data and Constant.DUR in data and \
               not data[Constant.OP_NAME].startswith(Constant.STEP_PREFIX) and \
               data[Constant.PID] == py_pid

    @staticmethod
    def _is_bp_op(op_name: str) -> bool:
        """
        check whether op is bp op (autograd prefix)
        """
        return op_name.startswith(Constant.BP_AUTOGRAD_OP)

    @staticmethod
    def _is_bp_data(data: dict, bp_tid: int, py_pid: int) -> bool:
        """
        check whether data is valid bp data
        """
        return data[Constant.OP_TID] == bp_tid and \
               Constant.TS in data and Constant.DUR in data and \
               data[Constant.PID] == py_pid

    @staticmethod
    def _is_torch_to_npu_link(data: dict, fp_tid: int) -> bool:
        """
        check whether data is torch to npu link
        """
        # NOTE(review): this compares the event's *pid* field against fp_tid
        # (a thread id) — confirm whether async_npu flow-start events really
        # carry the fp tid in their pid field, or whether OP_TID was intended.
        return Constant.CAT in data and data[Constant.CAT] == Constant.ASYNC_NPU and \
               data[Constant.PH] == Constant.PH_START and \
               data[Constant.PID] == fp_tid

    @staticmethod
    def _is_send_recv_op(op_name: str) -> bool:
        """
        check whether op is hcom send or recv op
        """
        # NOTE(review): the patterns are recompiled on every call; the re
        # module caches compiled patterns, so this is correct but could be
        # hoisted to module level for clarity.
        # eg: hcom_BatchSendRecv__101_0_1
        p1 = re.compile(r'hcom_\w+SendRecv__\d+')
        # eg: hcom_send__101_0_1
        p2 = re.compile(r'hcom_send__\d+')
        # eg: hcom_receive__101_0_1
        p3 = re.compile(r'hcom_receive__\d+')
        return bool(p1.match(op_name)) or bool(p2.match(op_name)) or bool(p3.match(op_name))

    @staticmethod
    def _is_hcom_op(op_name: str) -> bool:
        """
        check whether data is hcom data (any "hcom_" communication op)
        """
        return op_name.startswith(Constant.HCOM_OP_PREFIX)

    @staticmethod
    def _is_python_process(data: dict) -> bool:
        """
        check whether data is the python process_name metadata event
        """
        return Constant.PH in data and data[Constant.PH] == Constant.PH_META and \
               data[Constant.OP_NAME] == Constant.PROCESS_NAME and \
               data[Constant.ARGS][Constant.OP_NAME] == Constant.FRAMEWORK_NAME

    @staticmethod
    def _is_step_op(data: dict) -> bool:
        """
        check whether data is a ProfilerStep# wrapper event
        """
        return data[Constant.OP_NAME].startswith(Constant.STEP_PREFIX)

    @staticmethod
    def _is_ascend_process(data: dict) -> bool:
        """
        check whether data is the Ascend Hardware process_name metadata event
        """
        return Constant.PH in data and data[Constant.PH] == Constant.PH_META and \
               data[Constant.OP_NAME] == Constant.PROCESS_NAME and \
               data[Constant.ARGS][Constant.OP_NAME] == Constant.ASCEND_HARDWARE_NAME

    @staticmethod
    def _is_npu_op(data: dict, ascend_pid: int) -> bool:
        """
        check whether data is npu op
        """
        # All-uppercase names are excluded — presumably non-kernel hardware
        # markers; confirm against msprof output.
        return Constant.PH in data and data[Constant.PH] == Constant.PH_X and \
               not data[Constant.OP_NAME].isupper() and \
               data[Constant.PID] == ascend_pid

    def process(self, raw_data: list) -> Optional[FineTraceViewData]:
        """
        preprocess raw data in two passes: first discover the relevant
        pids/tids (and hcom time bounds), then bucket the events.
        Returns None when raw_data is empty.
        """
        if not raw_data:
            print("[ERROR] No raw data found in trace view data.")
            return None

        raw_fp_tids, raw_bp_tids, raw_hcom_tids = set(), set(), set()
        fine_data = FineTraceViewData()

        # counting fp ops and bp ops tid and ascend pid
        for data in raw_data:
            if self._is_fp_op(data[Constant.OP_NAME]):
                raw_fp_tids.add(data[Constant.OP_TID])
            elif self._is_bp_op(data[Constant.OP_NAME]):
                raw_bp_tids.add(data[Constant.OP_TID])
            elif self._is_send_recv_op(data[Constant.OP_NAME]):
                fine_data.hcom_ops.append(data)
                raw_hcom_tids.add(data[Constant.OP_TID])
            elif self._is_python_process(data):
                fine_data.py_pid = data[Constant.PID]
            elif self._is_ascend_process(data):
                fine_data.ascend_pid = data[Constant.PID]

            # find max and min ts in hcom ops
            if self._is_hcom_op(data[Constant.OP_NAME]):
                # for compatibility with old data (ts is float type)
                ts = data[Constant.TS] if not isinstance(data[Constant.TS], float) else str(data[Constant.TS])
                fine_data.min_ts = min(fine_data.min_ts, ts)
                fine_data.max_ts = max(fine_data.max_ts, ts)

        # The fp thread also hosts bp-prefixed events; subtracting bp tids
        # leaves the tid that carries only fp ops.
        unique_fp_tid = list(raw_fp_tids - raw_bp_tids)
        unique_bp_tid = list(raw_bp_tids)
        fine_data.hcom_tids = list(raw_hcom_tids)

        if not unique_fp_tid or not unique_bp_tid:
            print("[INFO] No fp or bp tid found in trace view data.")
        else:
            fine_data.fp_tid, fine_data.bp_tid = unique_fp_tid[0], unique_bp_tid[0]

        # filter fp ops and bp ops and torch_to_npu_links
        for data in raw_data:
            if self._is_fp_data(data, fine_data.fp_tid, fine_data.py_pid):
                fine_data.fp_ops.append(data)
            elif self._is_bp_data(data, fine_data.bp_tid, fine_data.py_pid):
                fine_data.bp_ops.append(data)
            elif self._is_torch_to_npu_link(data, fine_data.fp_tid):
                fine_data.torch_to_npu_links.append(data)
            elif self._is_npu_op(data, fine_data.ascend_pid):
                fine_data.npu_ops_ts_dur[data[Constant.TS]] = data[Constant.DUR]

        fine_data.sort()
        return fine_data
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/__init__.py b/profiler/advisor_review/advisor_backend/compute_advice/__init__.py
new file mode 100644
index 00000000000..8400fd5ecd1
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/compute_advice_base.py b/profiler/advisor_review/advisor_backend/compute_advice/compute_advice_base.py
new file mode 100644
index 00000000000..cafbafd8e28
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/compute_advice_base.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import abstractmethod
+from collections import defaultdict
+import os
+
+from advice_base import AdviceBase
+from common_func.file_manager import FileManager
+
+
class ComputeAdviceBase(AdviceBase):
    """Base class for compute-side advice.

    Locates kernel_details.csv under an ascend_pt collection directory (or
    accepts the csv path directly) and exposes helpers shared by concrete
    compute advisers, such as call-stack availability detection.
    """
    ASCEND_PT = 'ascend_pt'
    ASCEND_PROFILER_OUTPUT = 'ASCEND_PROFILER_OUTPUT'
    KERNEL_DETAIL_FILE = "kernel_details.csv"
    TRACE_VIEW_FILE = "trace_view.json"

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.kernel_details_path = ""  # resolved by path_check()
        self.has_preparse = False
        self.preparse_data = defaultdict(list)
        self.call_stack = None         # tri-state cache: None = not probed yet
        self.trace_view_path = ""

    def path_check(self):
        """
        Check whether the input path is valid and resolve kernel_details.csv.

        Accepts either an ascend_pt collection directory or the csv file
        itself. Returns True on success (after triggering preparse).
        """
        if not os.path.exists(self.collection_path):
            print("[ERROR] Path: {} is not exist.".format(self.collection_path))
            return False
        # Consistency fix: use the class-level name constants instead of
        # repeating the string literals inline.
        if os.path.isdir(self.collection_path) and self.collection_path.endswith(self.ASCEND_PT):
            self.kernel_details_path = os.path.join(self.collection_path, self.ASCEND_PROFILER_OUTPUT,
                                                    self.KERNEL_DETAIL_FILE)
            if not os.path.exists(self.kernel_details_path):
                print("[ERROR] kernel_details.csv is not exist in the Path: {}.".format(
                    os.path.join(self.collection_path, self.ASCEND_PROFILER_OUTPUT)))
                return False
        elif os.path.isfile(self.collection_path) and \
                os.path.basename(self.collection_path) == self.KERNEL_DETAIL_FILE:
            self.kernel_details_path = self.collection_path
        else:
            print("[ERROR] Please input ascend_pt or kernel_details.csv")
            return False
        print("[INFO] Start to analyse the target file: {}".format(self.kernel_details_path))
        self.preparse()
        return True

    def has_callstack(self):
        """Return (and cache) whether the profiling run collected python call
        stacks, i.e. with_stack enabled and ProfilerActivity.CPU active."""
        if self.call_stack is not None:
            return self.call_stack
        profiler_info_json_path = ""
        for file in os.listdir(self.collection_path):
            if file.startswith("profiler_info"):
                profiler_info_json_path = os.path.join(self.collection_path, file)
                break
        if not profiler_info_json_path:
            self.call_stack = False
            return self.call_stack
        self.trace_view_path = os.path.join(self.collection_path, self.ASCEND_PROFILER_OUTPUT,
                                            self.TRACE_VIEW_FILE)
        if not os.path.exists(profiler_info_json_path) or not os.path.exists(self.trace_view_path):
            self.call_stack = False
            return self.call_stack
        info = FileManager.read_json_file(profiler_info_json_path)
        # Collapse the repeated info.get("config").get("common_config") chains.
        common_config = (info.get("config") or {}).get("common_config") or {}
        if not common_config.get("with_stack"):
            self.call_stack = False
            return self.call_stack
        activities = common_config.get("activities")
        if not activities or "ProfilerActivity.CPU" not in activities:
            self.call_stack = False
            return self.call_stack
        self.call_stack = common_config.get("with_stack")
        return self.call_stack

    @abstractmethod
    def run(self):
        """
        analyze profiling data and advice
        """

    @abstractmethod
    def output(self):
        """
        output relevant data
        """
        self.output_format_data[self.DATA] = self.cur_data
        self.output_format_data[self.BOTTLENECK] = self.cur_bottleneck
        self.output_format_data[self.ADVICE] = self.cur_advice

    def preparse(self):
        # Hook for subclasses; guarded so repeated calls are no-ops.
        if self.has_preparse:
            return
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/__init__.py b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/__init__.py
new file mode 100644
index 00000000000..8400fd5ecd1
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/csv_analyzer.py b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/csv_analyzer.py
new file mode 100644
index 00000000000..c85c14d618c
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/csv_analyzer.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import multiprocessing
+
+import pandas as pd
+import numpy as np
+
+from common_func_advisor.constant import Constant
+from .op_perf import OpPerfFactory
+
+
class CSVAnalyzer:
    """Scans kernel_details.csv for op-type sequences that match the known
    fusable patterns in Constant.PATTERN_DICT."""

    def __init__(self, path) -> None:
        self._path = path

    def process(self):
        """Return a DataFrame with one row per pattern occurrence group:
        count, match indices, duration sums, and the first match timestamp."""
        df = pd.read_csv(self._path, dtype={"Start Time(us)": str})
        # Detect whether fusable operator sequences are present.
        op_type_list = df["Type"].tolist()
        duration_list = df["Duration(us)"].tolist()
        start_times = df["Start Time(us)"].tolist()
        # Strip the trailing "\t" separator from each raw timestamp string.
        # NOTE(review): this assumes every value ends with the separator;
        # otherwise a significant character is dropped — confirm with the csv.
        start_times = [start_time[:-1] for start_time in start_times]
        result_list = []
        for pattern in Constant.PATTERN_DICT.keys():
            result_list.extend(self.find_all_sub_lists(op_type_list, duration_list, start_times, pattern))
        data_frame = pd.DataFrame(result_list)
        data_frame.columns = ["pattern_name", "pattern", "len", "count", "duration sum(us)", "op durations(us)",
                              "index", "first_timestamp"]
        return data_frame

    @staticmethod
    def find_all_sub_lists(op_type_list, duration_list, start_times, expect_sub_list):
        """Slide a window of len(expect_sub_list) over op_type_list and
        aggregate every exact match; emit a single all-zero row when the
        pattern never occurs."""
        # Map sub-list -> [count, indices, duration_sum, per-slot durations,
        # length, first start time]. At most one key ends up here, because
        # only windows equal to expect_sub_list are recorded.
        len_sub_list = len(expect_sub_list)
        expect_sub_list = tuple(expect_sub_list)
        sublist_dict = {}
        # Walk every window of length len_sub_list.
        for i in range(len(op_type_list) - len_sub_list + 1):
            sublist = tuple(op_type_list[i:i + len_sub_list])
            if sublist != expect_sub_list:
                continue
            # First hit initializes the stats; later hits accumulate.
            if sublist in sublist_dict:
                # count
                sublist_dict[sublist][0] += 1
                # index
                sublist_dict[sublist][1].append(i)
                # total duration
                sublist_dict[sublist][2] += sum(duration_list[i:i + len_sub_list])
                # per-slot duration sums (element-wise accumulation)
                zip_data = zip(sublist_dict[sublist][3], duration_list[i:i + len_sub_list])
                sublist_dict[sublist][3] = [a + b for a, b in zip_data]
            else:
                sublist_dict[sublist] = [1, [i], sum(duration_list[i:i + len_sub_list]),
                                         duration_list[i:i + len_sub_list], len_sub_list, start_times[i]]
        # Build the output rows for this pattern.
        repeated_sublists = []
        for sublist, (count, index, duration_sum, op_durations, sublist_len, first_time) in sublist_dict.items():
            pattern_name = Constant.PATTERN_DICT.get(sublist, "unknown")
            op_durations = [round(num, 2) for num in op_durations]
            repeated_sublists.append([pattern_name, sublist, sublist_len, count,
                                      duration_sum, op_durations, index, first_time])
        if len(sublist_dict) == 0:
            # No match at all: emit a zero row so the pattern still appears.
            pattern_name = Constant.PATTERN_DICT.get(expect_sub_list, "unknown")
            repeated_sublists.append([pattern_name, expect_sub_list, 0, 0, 0, 0, 0, 0])
        # Return the aggregated rows for this pattern.
        return repeated_sublists
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/json_analyzer.py b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/json_analyzer.py
new file mode 100644
index 00000000000..fd2a72ffa39
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/json_analyzer.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+
+from common_func_advisor.trace_view_json import TraceViewJson
+
+
class JSONAnalyzer(object):
    """Extracts python call stacks from a trace_view.json for rows of a
    pattern-match DataFrame."""

    def __init__(self, path):
        self._path = path

    def get_custom_code(self, data: pd.DataFrame, ts_col: str, output_col: str):
        """Resolve the python call stack for every row of `data`.

        Each row's `ts_col` timestamp is mapped through its torch->npu flow
        event to the framework side; rows that cannot be resolved get "".
        Returns a DataFrame with a single `output_col` column indexed like
        `data`.
        """
        trace_json = TraceViewJson(self._path)
        callstacks = pd.DataFrame(columns=[output_col])

        # Fix: the column check is loop-invariant; the original re-evaluated
        # it on every row. Checking once up front short-circuits identically
        # (the empty-data guard preserves the original's silent no-op).
        if not data.empty and ts_col not in data.columns.tolist():
            print("[ERROR] No {} col found in data columns.".format(ts_col))
            return callstacks
        for i, row in data.iterrows():
            timestamp = row[ts_col]
            flow_event = trace_json.get_torch_2_npu_flow_event(timestamp)
            if not flow_event.valid():
                print("[ERROR] Get flow event failed for pattern {}.".format(row['pattern']))
                callstacks.loc[i] = ""
                continue
            flow_event_s_key = flow_event.s_point_ts
            python_dur_events = trace_json.get_python_dur_events_contain_ts(flow_event_s_key)
            if not python_dur_events:
                print("[ERROR] No python dur event found for pattern {}.".format(row['pattern']))
                callstacks.loc[i] = ""
                continue
            # Keep compatibility between old and new callstack layouts.
            if python_dur_events[0].args.get("Call stack"):
                # Old layout: the full stack is one ';'-separated args entry.
                callstack = python_dur_events[0].args.get("Call stack").split(";")
            else:
                python_dur_events.sort(key=lambda e: e.ts)
                # New layout: the stack is the chain of python_function events.
                callstack = [event.name for event in python_dur_events if event.cat == "python_function"]
            callstack_str = "\n".join(callstack)
            callstacks.loc[i] = callstack_str
        return callstacks
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/op_perf.py b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/op_perf.py
new file mode 100644
index 00000000000..7bcbed5a758
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/op_perf.py
@@ -0,0 +1,196 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+from typing import Dict
+
+from common_func_advisor.constant import Constant
+from common_func_advisor.constant import CoreType
+from common_func_advisor.constant import PerfColor
+
+
class OpPerfFactory:
    """Build the OpPerf flavor matching an op row's accelerator core type."""

    @classmethod
    def build(cls, op_row: Dict):
        """Return a VecOpPerf (AIV), CubeOpPerf (AIC) or plain OpPerf."""
        core_to_perf = {CoreType.AIV: VecOpPerf, CoreType.AIC: CubeOpPerf}
        perf_cls = core_to_perf.get(op_row.get(Constant.TITLE.TASK_TYPE), OpPerf)
        return perf_cls(op_row)
+
+
class OpPerf:
    """Wrap one kernel_details.csv row and score its performance.

    The constructor mirrors the CSV columns into attributes; update()
    enriches the underlying row with size, throughput and a perf color.
    """

    def __init__(self, op_row: Dict):
        if "OP Type" in op_row.keys():
            # Older CSVs use "OP Type" style headers; switch title constants.
            Constant.update_title()
        self.row = op_row
        self.model_name = op_row.get("Model Name")
        self.model_id = op_row.get("Model ID")
        self.task_id = op_row.get("Task ID")
        self.stream_id = op_row.get("Stream ID")
        self.infer_id = op_row.get("Infer ID")
        self.op_name = op_row.get("Name")
        self.op_type = op_row.get("Type")
        self.task_type = op_row.get("Accelerator Core")
        self.task_start_time = op_row.get("Start Time(us)")
        self.task_duration = op_row.get("Duration(us)")
        self.task_wait_time = op_row.get("Wait Time(us)")
        self.block_dim = op_row.get("Block Dim")
        self.mix_block_dim = op_row.get("Mix Block Dim")

        self.hf32_eligible = op_row.get("HF32 Eligible")
        self.input_shapes = op_row.get("Input Shapes")
        self.input_data_types = op_row.get("Input Data Types")
        self.input_formats = op_row.get("Input Formats")
        self.output_shapes = op_row.get("Output Shapes")
        self.output_data_types = op_row.get("Output Data Types")
        self.output_formats = op_row.get("Output Formats")
        self.context_id = op_row.get("Context ID")
        self.aicore_time = op_row.get("aicore_time(us)")
        self.aic_total_cycles = op_row.get("aic_total_cycles")

        self.aic_mac_time = op_row.get("aic_mac_time(us)")
        self.aic_mac_ratio = op_row.get("aic_mac_ratio")
        self.aic_scalar_time = op_row.get("aic_scalar_time(us)")
        self.aic_scalar_ratio = op_row.get("aic_scalar_ratio")
        self.aic_mte1_time = op_row.get("aic_mte1_time(us)")
        self.aic_mte1_ratio = op_row.get("aic_mte1_ratio")
        self.aic_mte2_time = op_row.get("aic_mte2_time(us)")
        self.aic_mte2_ratio = op_row.get("aic_mte2_ratio")
        self.aic_fixpipe_time = op_row.get("aic_fixpipe_time(us)")
        self.aic_fixpipe_ratio = op_row.get("aic_fixpipe_ratio")
        self.aic_icache_miss_rate = op_row.get("aic_icache_miss_rate")
        self.aiv_time = op_row.get("aiv_time(us)")
        self.aiv_total_cycles = op_row.get("aiv_total_cycles")
        self.aiv_vec_time = op_row.get("aiv_vec_time(us)")
        self.aiv_vec_ratio = op_row.get("aiv_vec_ratio")
        self.aiv_scalar_time = op_row.get("aiv_scalar_time(us)")
        self.aiv_scalar_ratio = op_row.get("aiv_scalar_ratio")
        self.aiv_mte2_time = op_row.get("aiv_mte2_time(us)")

        self.aiv_mte2_ratio = op_row.get("aiv_mte2_ratio")
        self.aiv_mte3_time = op_row.get("aiv_mte3_time(us)")
        self.aiv_mte3_ratio = op_row.get("aiv_mte3_ratio")
        self.aiv_icache_miss_rate = op_row.get("aiv_icache_miss_rate")
        self.cube_utilization = op_row.get("cube_utilization( %)")

    @staticmethod
    def get_dtype_size(dtype_str: str):
        """Byte width of a dtype name; 0 when the dtype is unknown."""
        return Constant.DTYPE_SIZE_MAP.get(dtype_str.lower(), 0)

    @staticmethod
    def get_element_count(shape: list):
        """Number of elements described by shape; 1 for an empty shape.

        Fix: the original reduce had no initializer and raised TypeError on
        an empty sequence; the initializer of 1 also gives the natural
        scalar element count.
        """
        return functools.reduce(lambda x, y: int(x) * int(y), shape, 1)

    @staticmethod
    def shape_to_tuple(shape_str: str) -> tuple:
        """Parse '"2,3;4,5"' into ((2, 3), (4, 5)); empty dims become 0.

        Fix: non-string / empty inputs now return () to honor the declared
        tuple return type (the original returned a list).
        """
        if not isinstance(shape_str, str):
            return ()
        shape_str = shape_str.strip('"')
        stripped = shape_str.strip(';')
        if not stripped:
            return ()
        pairs = stripped.split(';')
        shape_result = []
        for pair in pairs:
            pair = pair.strip(";")
            elements = pair.split(',')
            elements = tuple(int(element) if "" != element else 0 for element in elements)
            shape_result.append(elements)
        return tuple(shape_result)

    @staticmethod
    def dtype_to_tuple(dtypes_str: str) -> tuple:
        """Parse '"FLOAT16;FLOAT"' into ('FLOAT16', 'FLOAT').

        Fix: non-string / empty inputs now return () to honor the declared
        tuple return type (the original returned a list).
        """
        if not isinstance(dtypes_str, str):
            return ()
        dtypes_str = dtypes_str.strip('"')
        stripped = dtypes_str.strip(';')
        if not stripped:
            return ()
        pairs = stripped.split(';')
        return tuple(pairs)

    def get_mac_ratio(self):
        """Cube MAC utilization ratio as read from the CSV row."""
        return self.aic_mac_ratio

    def get_size(self, shapes_str, dtypes_str):
        """Total tensor bytes described by shape/dtype column strings."""
        shapes = self.shape_to_tuple(shapes_str)
        dtypes = self.dtype_to_tuple(dtypes_str)
        if len(shapes) > len(dtypes):
            print("[ERROR] The size of shape is greater than that of dtypes.")
            return 0
        if len(shapes) < len(dtypes):
            # Pad with scalar shapes so every dtype has a matching shape.
            shapes = list(shapes)
            shapes.extend([(1,)] * (len(dtypes) - len(shapes)))
        all_size = 0
        for index, shape in enumerate(shapes):
            element_count = self.get_element_count(shape)
            dtype_size = self.get_dtype_size(dtypes[index])
            all_size += element_count * dtype_size
        return all_size

    def get_calc_size(self):
        # input and output bytes (MB)
        if not self.input_shapes or not self.output_shapes:
            print("[ERROR] There is no tensor data, do not assess vector op performance.")
            return 0
        input_size = self.get_size(self.input_shapes, self.input_data_types)
        output_size = self.get_size(self.output_shapes, self.output_data_types)
        return (input_size + output_size) / (Constant.BYTE_UNIT_TRANS * Constant.BYTE_UNIT_TRANS)

    def get_throughput(self):
        # throughput(GB/s)
        if not self.task_duration or abs(self.task_duration) < 1e-6:
            print("[ERROR] There is no task_duration, do not assess vector op performance.")
            return 0
        return self.row[Constant.TITLE.SIZE] / Constant.BYTE_UNIT_TRANS / self.task_duration * Constant.UNIT_TRANS * Constant.UNIT_TRANS

    def get_perf_color(self):
        """Base rating; subclasses override. WHITE means not assessed."""
        return PerfColor.WHITE

    def update(self):
        """Compute size/throughput/color and write them back into the row."""
        self.row[Constant.TITLE.SIZE] = self.get_calc_size()
        self.row[Constant.TITLE.THROUGHPUT] = self.get_throughput()
        self.row[Constant.TITLE.COLOR] = self.get_perf_color().name
        return self.row
+
+
class VecOpPerf(OpPerf):
    """Rate a vector (AIV) op by throughput against the configured threshold."""

    def get_perf_color(self) -> PerfColor:
        """Map throughput to WHITE/RED/YELLOW/GREEN."""
        tp = self.row[Constant.TITLE.THROUGHPUT]
        half_threshold = Constant.TP_THRESHOLD / 2
        if tp == 0:
            # No throughput computed -> not assessed.
            return PerfColor.WHITE
        if tp < half_threshold and self.task_duration > 20:
            # Very low throughput on a long-running op.
            return PerfColor.RED
        if half_threshold <= tp < Constant.TP_THRESHOLD:
            return PerfColor.YELLOW
        # Covers both high throughput and short low-throughput ops,
        # matching the original else branch.
        return PerfColor.GREEN
+
+
class CubeOpPerf(OpPerf):
    """Rate a cube (AIC) op by its MAC utilization ratio."""

    def get_perf_color(self) -> PerfColor:
        """Map aic_mac_ratio to a perf color; WHITE when it is missing/zero."""
        ratio = self.get_mac_ratio()
        if not ratio:
            print("[WARNING] There is no aic_mac_ratio, do not assess cube op performance.")
            return PerfColor.WHITE
        if ratio >= 0.8:
            return PerfColor.GREEN
        if ratio >= 0.6:
            return PerfColor.YELLOW
        return PerfColor.RED
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/npu_fused_advice.py b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused_advice.py
new file mode 100644
index 00000000000..fd5610bbbbb
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused_advice.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from abc import ABC
+
+import pandas as pd
+
+from compute_advice.compute_advice_base import ComputeAdviceBase
+from compute_advice.npu_fused.csv_analyzer import CSVAnalyzer
+from compute_advice.npu_fused.json_analyzer import JSONAnalyzer
+
+
class NpuFusedAdvice(ComputeAdviceBase, ABC):
    """Find fusable op patterns in kernel_details.csv and advise replacements."""

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.cur_data = dict()
        self.cur_bottleneck = str()
        self.cur_advice = str()
        self.kernel_details_path = ""
        # Whether call-stack info is available; resolved during process().
        self.call_stack = None

    def run(self):
        """Analyze the collection and return the formatted advice dict."""
        if not self.path_check():
            return self.output_format_data
        self.process()
        self.output()
        return self.output_format_data

    def process(self):
        """Build cur_data / cur_bottleneck / cur_advice from the kernel CSV."""
        csv_analyzer = CSVAnalyzer(self.kernel_details_path)
        all_pattern_data = csv_analyzer.process()
        all_pattern_data = all_pattern_data.sort_values(by='duration sum(us)', ascending=False)
        # Keep only patterns that actually consumed time (plain boolean
        # indexing instead of the original DataFrame.get(mask) detour).
        filter_data = all_pattern_data[all_pattern_data['duration sum(us)'] > 0]
        # Fix: the original never updated self.call_stack after __init__
        # (it stayed None), so the per-pattern call-stack text below was
        # never appended even when stacks were available.
        self.call_stack = self.has_callstack()
        if not self.call_stack:
            print("[Warning] No call stack info found, advice will be incomplete")
            self.cur_data = filter_data
        else:
            json_analyzer = JSONAnalyzer(self.trace_view_path)
            custom_code = json_analyzer.get_custom_code(filter_data, "first_timestamp", "custom code")
            self.cur_data = pd.concat([filter_data, custom_code], axis=1)
        op_num = len(self.cur_data.index)
        op_dur = filter_data["duration sum(us)"].sum()
        if op_num > 0:
            index = 0
            # NOTE(review): the column is in microseconds but the message
            # says "ms" — confirm the intended unit with report consumers.
            self.cur_bottleneck = f"The computing time of fusable op is {round(op_dur, 2)} ms."
            self.cur_advice = ""
            for _, row in self.cur_data.iterrows():
                advice = f"Advice {index}:\n"
                cur_op = "[" + ", ".join(row.loc["pattern"]) + "]"
                npu_fused_op = row.loc["pattern_name"]
                advice += f"Replace {cur_op} with {npu_fused_op}. "
                if self.call_stack:
                    advice += f"This pattern first happened in: \n{row['custom code']}"
                if index != op_num - 1:
                    advice += "\n"
                index += 1
                self.cur_advice += advice
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/npu_slow_advice.py b/profiler/advisor_review/advisor_backend/compute_advice/npu_slow_advice.py
new file mode 100644
index 00000000000..caff1c792c2
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/npu_slow_advice.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from abc import ABC
+import multiprocessing
+
+import pandas as pd
+
+from compute_advice.compute_advice_base import ComputeAdviceBase
+from compute_advice.npu_fused.op_perf import OpPerfFactory
+from common_func_advisor.constant import Constant
+from common_func_advisor.constant import PerfColor
+from advisor_backend.common_func_advisor.trace_view_json import TraceViewJson
+
+
class NpuSlowAdvice(ComputeAdviceBase, ABC):
    """Score every kernel in kernel_details.csv and export a colored report."""

    OP_PERF_SHEET = "op_perf"

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.kernel_details_path = ""
        self.data = pd.DataFrame()

    @staticmethod
    def save_to_excel(data: pd.DataFrame, file_path: str) -> None:
        """Write data to file_path, coloring rows by their perf rating.

        Fix: the writer is now a context manager, so the workbook handle is
        closed even when a formatting step raises (the original leaked it).
        """
        with pd.ExcelWriter(file_path, engine="xlsxwriter", mode="w") as writer:
            data.index.name = Constant.TITLE.INDEX
            data.to_excel(writer, index=True, sheet_name=NpuSlowAdvice.OP_PERF_SHEET)
            NpuSlowAdvice.color_sheet(data, writer.book, writer.sheets[NpuSlowAdvice.OP_PERF_SHEET])
            writer.sheets[NpuSlowAdvice.OP_PERF_SHEET].freeze_panes = "A2"

    @staticmethod
    def color_sheet(data: pd.DataFrame, workbook, worksheet):
        """Apply a background fill per row according to its COLOR column."""
        color_rgb = {
            PerfColor.GREEN.name: workbook.add_format({'bg_color': '#C6EFCE'}),
            PerfColor.YELLOW.name: workbook.add_format({'bg_color': '#FFEB9C'}),
            PerfColor.RED.name: workbook.add_format({'bg_color': '#FFC7CE'}),
        }
        for row_index, row in data.iterrows():
            fill_format = color_rgb.get(row[Constant.TITLE.COLOR])
            if not fill_format:
                continue
            # +1 skips the worksheet header row.
            worksheet.set_row(row_index + 1, None, fill_format)

    @staticmethod
    def update_op_row(row: tuple):
        """Enrich one (index, Series) pair from DataFrame.iterrows()."""
        return OpPerfFactory.build(row[1]).update()

    def get_call_stack(self, data: pd.DataFrame, index_id: int, ts_col: str) -> str:
        """Return the Python call stack for one row, or '' without stack info."""
        if not self.has_callstack():
            print("There is no call stack info, please set 'with_stack=True'")
            return ""
        trace_json = TraceViewJson(self.trace_view_path)
        return trace_json.get_call_stack(data, index_id, ts_col)

    def run(self):
        """Process the kernel CSV (if paths are valid) and return the data."""
        if not self.path_check():
            return self.data
        self.process()
        return self.data

    def process(self):
        """Load the kernel CSV and score every op in parallel."""
        self.data = pd.read_csv(self.kernel_details_path, dtype={"Start Time(us)": str})
        # Strip the trailing tab separator from the timestamp column.
        self.data["Start Time(us)"] = self.data["Start Time(us)"].apply(lambda x: x[:-1])
        # Fix: the pool is now a context manager, so worker processes are
        # reclaimed even if map() raises (the original only close()d on
        # success and never joined).
        with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
            result = pool.map(self.update_op_row, self.data.iterrows())
        self.data = pd.DataFrame(result)
diff --git a/profiler/advisor_review/advisor_backend/interface.py b/profiler/advisor_review/advisor_backend/interface.py
new file mode 100644
index 00000000000..3e20c26d4d7
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/interface.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+
+sys.path.append(
+ os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "advisor_backend"))
+sys.path.append(
+ os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "compare_tools"))
+sys.path.append(
+ os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "cluster_analyse"))
+from common_func_advisor.constant import Constant
+from advisor_backend.advice_factory.cluster_advice_factory import ClusterAdviceFactory
+from advisor_backend.advice_factory.compute_advice_factory import ComputeAdviceFactory
+from advisor_backend.advice_factory.timeline_advice_factory import TimelineAdviceFactory
+from advisor_backend.advice_factory.overall_advice_factory import OverallAdviceFactory
+
+
class Interface:
    """Facade routing a (mode, advice) request to the matching advice factory."""

    def __init__(self, collection_path: str):
        self.collection_path = os.path.realpath(collection_path)
        self._factory_controller = FactoryController(collection_path)

    def get_data(self: any, mode: str, advice: str, **kwargs):
        """Produce the advice result; input lengths are bounded first."""
        if len(mode) > Constant.MAX_INPUT_MODE_LEN or len(advice) > Constant.MAX_INPUT_ADVICE_LEN:
            # Fix: the message now covers both checked inputs and has the
            # missing space after the tag (was '[ERROR]Input Mode is illegal.'
            # even when only `advice` was too long).
            msg = '[ERROR] Input mode or advice is illegal.'
            raise RuntimeError(msg)
        factory = self._factory_controller.create_advice_factory(mode, kwargs.get("input_path", ""))
        return factory.produce_advice(advice, kwargs)
+
+
class FactoryController:
    """Map a mode string to the advice factory class that serves it."""

    FACTORY_LIB = {
        Constant.CLUSTER: ClusterAdviceFactory,
        Constant.COMPUTE: ComputeAdviceFactory,
        Constant.TIMELINE: TimelineAdviceFactory,
        Constant.OVERALL: OverallAdviceFactory
    }

    def __init__(self, collection_path: str):
        self.collection_path = os.path.realpath(collection_path)
        self.temp_input_path = None

    def create_advice_factory(self, mode: str, input_path: str):
        """Instantiate the factory for mode, preferring input_path if given.

        Raises:
            ValueError: for an unknown mode. Fix: the original called
            FACTORY_LIB.get(mode)(...) and produced an opaque
            "'NoneType' object is not callable" on a bad mode.
        """
        collection_path = input_path if input_path else self.collection_path
        factory_cls = self.FACTORY_LIB.get(mode)
        if factory_cls is None:
            raise ValueError(f"[ERROR] Unsupported mode: {mode}.")
        return factory_cls(collection_path)
+
+
+if __name__ == "__main__":
+ Interface()
diff --git a/profiler/advisor_review/advisor_backend/overall_advice/__init__.py b/profiler/advisor_review/advisor_backend/overall_advice/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/advisor_backend/overall_advice/overall_summary_advice.py b/profiler/advisor_review/advisor_backend/overall_advice/overall_summary_advice.py
new file mode 100644
index 00000000000..f5bfc351f28
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/overall_advice/overall_summary_advice.py
@@ -0,0 +1,176 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+from advisor_backend.advice_base import AdviceBase
+from compare_backend.utils.constant import Constant
+from compare_interface.comparison_interface import ComparisonInterface
+
+
class OverallSummaryAdvice(AdviceBase):
    """Summarize the E2E time breakdown and flag overall bottlenecks.

    Runs an overall comparison (optionally against a baseline collection)
    via ComparisonInterface and aggregates the result into cur_data,
    cur_bottleneck and cur_advices.
    """

    # Per-category follow-up hint appended to each bottleneck message.
    advice_map = {
        "Computing Time": "if you want more detailed advice please use msprof-analyze advisor computation.",
        "Uncovered Communication Time": "if you want more detailed advice, please use msprof-analyze advisor schedule.",
        "Free Time": "if you want more detailed advice please use msprof-analyze advisor schedule."
    }
    # Maps report column names to the keys used in the output dicts.
    time_name_map = {
        "Computing Time": "computing",
        "Uncovered Communication Time": "communication",
        "Free Time": "free",
        'Cube Time(Num)': 'Cube Time',
        'Vector Time(Num)': 'Vector Time',
        'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)',
        'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)',
        'Other Time': "Other Computing Time",
        'SDMA Time(Num)': 'SDMA Time'
    }
    # Top-level categories and the sub-columns aggregated beneath each one.
    performance_time_dict = {
        "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)',
                           'Flash Attention Time(Backward)(Num)', 'Other Time'],
        "Uncovered Communication Time(Wait Time)": [],
        "Free Time": ['SDMA Time(Num)']
    }

    def __init__(self, collection_path: str, kwargs: dict):
        super().__init__(collection_path)
        self.base_collection_path = kwargs.get("base_collection_path", "")
        self._has_base_collection = False
        self._is_minimal_profiling = False
        self.cur_data = {}
        self.cur_bottleneck = {}
        self.cur_advices = ""
        self._headers = []
        self._base_data = []
        self._comparison_data = []

    @staticmethod
    def split_duration_and_num(time_value: str) -> tuple:
        """Split '0.229s(1756)' into (0.229, 1756); num is None if absent."""
        split_data = time_value.split("s")  # time value example: 0.229s(1756)
        duration, num = 0.0, None
        if len(split_data) >= 2:
            try:
                num = int(split_data[1].strip("()"))
            except ValueError:
                pass
        if len(split_data) >= 1:
            try:
                duration = float(split_data[0])
            except ValueError:
                print(f"[WARNING] Invalid time value: {time_value}.")
        return duration, num

    @staticmethod
    def calculate_ratio(dividend, divisor):
        """Return dividend / divisor, or inf for a zero/empty divisor."""
        if not divisor:
            return float("inf")
        return dividend / divisor

    def run(self):
        """Compare, build outputs, and return the formatted advice dict."""
        if self.path_check():
            self.process()
        self.output()
        self.identify_bottleneck()
        return self.output_format_data

    def path_check(self):
        """Validate collection paths; record whether a baseline is usable."""
        if self.base_collection_path:
            if os.path.exists(self.base_collection_path):
                self._has_base_collection = True
            else:
                print(f"[WARNING] Invalid path which not exists: {self.base_collection_path}.")
        return os.path.exists(self.collection_path)

    def process(self):
        """Run the overall comparison and aggregate per-category durations."""
        base_collection_path = self.base_collection_path if self._has_base_collection else self.collection_path
        result_data = ComparisonInterface(base_collection_path, self.collection_path).compare(Constant.OVERALL_COMPARE)
        for data in result_data.values():
            self._headers = data.get("headers", [])
            rows = data.get("rows", [])
            if len(rows) == 2:
                self._base_data = rows[0]
                self._comparison_data = rows[1]
        if not self._headers or not self._comparison_data:
            return
        self._is_minimal_profiling = 'E2E Time(Not minimal profiling)' not in self._headers
        if self._has_base_collection:
            self.cur_data["comparison_result"] = result_data
        time_category_dict = {}
        for time_category, time_list in self.performance_time_dict.items():
            time_value = self.get_time_value(time_category, self._comparison_data)
            if time_value == Constant.INVALID_VALUE:
                continue
            duration, _ = self.split_duration_and_num(time_value)
            # Strip the "(Wait Time)" style suffix for display keys.
            time_category = time_category.split("(")[0]
            time_category_dict[time_category] = duration
            self.get_sub_category_time(time_category, time_list, duration)
        self.cur_data["overall_data"] = time_category_dict

    def get_time_value(self, header_name: str, data_list: list):
        """Fetch the cell for header_name, or INVALID_VALUE if absent."""
        try:
            data_index = self._headers.index(header_name)
        except ValueError:
            return Constant.INVALID_VALUE
        try:
            time_value = data_list[data_index]
        except IndexError:
            return Constant.INVALID_VALUE
        return time_value

    def get_sub_category_time(self, category: str, time_list: list, total_duration: float):
        """Collect per-subtype duration/ratio/kernel-count under category."""
        sub_time_dict = {}
        for time_name in time_list:
            time_value = self.get_time_value(time_name, self._comparison_data)
            if time_value == Constant.INVALID_VALUE:
                continue
            sub_time_dict.setdefault(f"{category} Subtype", []).append(self.time_name_map.get(time_name, ""))
            duration, num = self.split_duration_and_num(time_value)
            sub_time_dict.setdefault("Duration(s)", []).append(duration)
            sub_time_dict.setdefault("Duration Ratio", []).append(
                "{:.2%}".format(self.calculate_ratio(duration, total_duration)))
            sub_time_dict.setdefault("Kernel Number", []).append(num)
        self.cur_data[self.time_name_map.get(category)] = sub_time_dict

    def identify_bottleneck(self):
        """Derive bottleneck messages from the overall time breakdown."""
        overall_data = self.cur_data.get("overall_data")
        if not overall_data:
            return
        # Fix: keep e2e_time numeric. The original stored '%.3f' % sum(...)
        # (a str) and later passed it as the divisor to calculate_ratio,
        # raising TypeError on the minimal-profiling Free Time branch.
        e2e_time = sum(overall_data.values())
        overall_bottleneck = f"The Model E2E Time is {e2e_time:.3f}s.\n"
        comparison_bottleneck = ""
        for time_type, time_value in overall_data.items():
            # add subtype time bottleneck
            advice = self.advice_map.get(time_type, "")
            self.cur_bottleneck[self.time_name_map.get(time_type)] = f"{time_type} is {time_value}s.\n{advice}"
            # add overall bottleneck
            overall_bottleneck += f" -- {time_type} is {time_value}s\n"
            if time_type == "Free Time" and self._is_minimal_profiling and self.calculate_ratio(time_value,
                                                                                                e2e_time) > 0.1:
                overall_bottleneck += "percentage of free time exceed the threshold 10%."
            if not self._has_base_collection:
                continue
            # add comparison bottleneck
            time_type_origin = "Uncovered Communication Time(Wait Time)" \
                if time_type == "Uncovered Communication Time" else time_type
            base_duration, _ = self.split_duration_and_num(self.get_time_value(time_type_origin, self._base_data))
            if time_value > base_duration:
                ratio = "{:.2%}".format(self.calculate_ratio(time_value - base_duration, base_duration))
                comparison_bottleneck += f"{time_type} exceeds the benchmark by {ratio}\n"
        self.cur_bottleneck["overall_data"] = overall_bottleneck
        self.cur_bottleneck["comparison_result"] = comparison_bottleneck

    def output(self):
        """Publish data, bottlenecks and advice into the output dict."""
        self.output_format_data[self.DATA] = self.cur_data
        self.output_format_data[self.BOTTLENECK] = self.cur_bottleneck
        self.output_format_data[self.ADVICE] = self.cur_advices
diff --git a/profiler/advisor_review/advisor_backend/prof_bean_advisor/__init__.py b/profiler/advisor_review/advisor_backend/prof_bean_advisor/__init__.py
new file mode 100644
index 00000000000..8400fd5ecd1
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/prof_bean_advisor/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/advisor_review/advisor_backend/prof_bean_advisor/cluster_step_trace_time_bean.py b/profiler/advisor_review/advisor_backend/prof_bean_advisor/cluster_step_trace_time_bean.py
new file mode 100644
index 00000000000..b108fc77a3f
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/prof_bean_advisor/cluster_step_trace_time_bean.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
class ClusterStepTraceTimeBean:
    """Typed accessor over one row of cluster step trace time.csv."""

    STEP = "Step"
    TYPE = "Type"
    INDEX = "Index"
    COMPUTING = "Computing"
    COMMUNICATION = "Communication(Not Overlapped)"
    FREE = "Free"

    def __init__(self, data: dict):
        # Raw CSV row, keyed by column name.
        self._data = data

    def _float_field(self, key: str, column: str) -> float:
        # Shared conversion: a missing key defaults to '' so both absent and
        # malformed values surface as the same descriptive ValueError.
        try:
            return float(self._data.get(key, ''))
        except ValueError as e:
            msg = f"[ERROR] Cluster step trace time.csv has invalid value in column '{column}'."
            raise ValueError(msg) from e

    @property
    def step(self) -> str:
        return self._data.get(self.STEP, '')

    @property
    def type(self) -> str:
        return self._data.get(self.TYPE, '')

    @property
    def index(self) -> int:
        try:
            return int(self._data.get(self.INDEX))
        except (ValueError, TypeError) as e:
            # Fix: also catch TypeError — a missing 'Index' key makes
            # int(None) raise TypeError, which escaped the original
            # except ValueError handler as a raw traceback.
            msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Index'."
            raise ValueError(msg) from e

    @property
    def compute(self) -> float:
        return self._float_field(self.COMPUTING, 'Computing')

    @property
    def communication(self) -> float:
        return self._float_field(self.COMMUNICATION, 'Communication')

    @property
    def free(self) -> float:
        return self._float_field(self.FREE, 'Free')
diff --git a/profiler/advisor_review/advisor_backend/timeline_advice/__init__.py b/profiler/advisor_review/advisor_backend/timeline_advice/__init__.py
new file mode 100644
index 00000000000..8400fd5ecd1
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/timeline_advice/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/advisor_review/advisor_backend/timeline_advice/op_schedule_advice.py b/profiler/advisor_review/advisor_backend/timeline_advice/op_schedule_advice.py
new file mode 100644
index 00000000000..9e492b2156c
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/timeline_advice/op_schedule_advice.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from decimal import Decimal
+from common_func_advisor.constant import Constant
+from timeline_advice.timeline_advice_base import TimelineAdviceBase
+
+
class OpScheduleAdvice(TimelineAdviceBase):
    """Detect op-scheduling bottlenecks from Computing/Free overlap events."""

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.cur_data = list()
        self.cur_bottleneck = str()
        self.cur_advice = str()

    def run(self):
        """Parse the timeline and return the formatted advice dict."""
        if not self.path_check():
            return self.output_format_data
        self.preparse()
        self.process()
        self.output()
        return self.output_format_data

    def process(self):
        """Pair free gaps with computing ops and rate NPU utilization."""
        cpt_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_CPT]
        free_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_FREE]
        if not cpt_data or not free_data:
            print("[ERROR] Fail to find Overlap data.")
            return

        op_dur = [entry.get("dur", 0) for entry in cpt_data]
        op_free = [0.0] * len(cpt_data)
        merge_data = list()
        merge_data.extend(cpt_data)
        merge_data.extend(free_data)
        # Decimal keeps full timestamp precision when ordering events.
        merge_data.sort(key=lambda x: Decimal(x.get("ts")))
        # Attribute each Free slot to a computing op: free_idx advances only
        # when a Computing event is passed, so a Free entry overwrites the
        # slot of the next computing op in timeline order.
        idx = free_idx = 0
        while idx < len(merge_data) and free_idx < len(op_free):
            entry = merge_data[idx]
            entry_name = entry.get("name")
            if entry_name == 'Free':
                op_free[free_idx] = merge_data[idx].get('dur')
            elif entry_name == 'Computing':
                free_idx += 1
            idx += 1
        self.cur_data.append(op_dur)
        self.cur_data.append(op_free)
        free_ratio, cpt_ratio, _ = self.get_ratio()
        if free_ratio < 0.2:
            # Little free time -> scheduling is not a bottleneck here.
            return
        # Fix: the original printed the free ratio as "NPU Utilication"
        # (typo) and the computing ratio as "Free Utilization" — the two
        # labels were crossed.
        self.cur_bottleneck = f"NPU Utilization: {round(cpt_ratio * 100, 2)}%, " \
                              f"NPU Free Utilization: {round(free_ratio * 100, 2)}%."
        if len(self.preparse_data[self.PREPARSE_TYPE.SYNCHRONIZE]) > 1:
            self.cur_advice = f"Device synchronize {len(self.preparse_data[self.PREPARSE_TYPE.SYNCHRONIZE])} times, " \
                              "try to reduce synchronization statements to alleviate the bottleneck of operator delivery.\n"
        small_op_num = self.small_op_block(op_free, op_dur)
        small_op_ratio = small_op_num / len(op_dur) if op_dur else 0.0
        if small_op_ratio > Constant.SMALL_OP_NUM_RATIO:
            self.cur_advice += "There are too many small operators, you can increase the batch size appropriately."

    def small_op_block(self, op_frees, op_durs):
        """Count ops whose preceding free gap dominates their own duration."""
        small_op_num = 0
        for op_free, op_dur in zip(op_frees, op_durs):
            if op_free > op_dur * Constant.SMALL_OP_DUR_RATIO:
                small_op_num += 1
        return small_op_num

    def get_ratio(self):
        """Return (free, computing, communication) shares of total time."""
        cpt_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_CPT]
        free_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_FREE]
        cmu_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_CMU]
        cpt_time = sum(x.get("dur", 0) for x in cpt_data)
        free_time = sum(x.get("dur", 0) for x in free_data)
        cmu_time = sum(x.get("dur", 0) for x in cmu_data)
        total_time = cpt_time + free_time + cmu_time
        if total_time > 0.0:
            return (free_time / total_time, cpt_time / total_time, cmu_time / total_time)
        return (0.0, 0.0, 0.0)
diff --git a/profiler/advisor_review/advisor_backend/timeline_advice/optimizer_advice.py b/profiler/advisor_review/advisor_backend/timeline_advice/optimizer_advice.py
new file mode 100644
index 00000000000..dee2e7ba563
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/timeline_advice/optimizer_advice.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from timeline_advice.timeline_advice_base import TimelineAdviceBase
+
+
class OptimizerAdvice(TimelineAdviceBase):
    """Advise replacing native torch optimizers with NPU fused counterparts."""

    # native optimizer step event name -> fused NPU optimizer to recommend
    OPTIMIZER_MAP = {
        "Optimizer.step#SGD.step": "torch_npu.optim.NpuFusedSGD",
        "Optimizer.step#Adadelta.step": "torch_npu.optim.NpuFusedAdadelta",
        "Optimizer.step#Lamb.step": "torch_npu.optim.NpuFusedLamb",
        "Optimizer.step#Adam.step": "torch_npu.optim.NpuFusedAdam",
        "Optimizer.step#AdamW.step": "torch_npu.optim.NpuFusedAdamW",
        "Optimizer.step#AdamP.step": "torch_npu.optim.NpuFusedAdamP",
        "Optimizer.step#BertAdam.step": "torch_npu.optim.NpuFusedBertAdam",
        "Optimizer.step#RMSprop.step": "torch_npu.optim.NpuFusedRMSprop",
        "Optimizer.step#RMSpropTF.step": "torch_npu.optim.NpuFusedRMSpropTF",
    }

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.cur_data = list()
        self.cur_bottleneck = str()
        self.cur_advice = str()

    def run(self):
        """Validate input, preparse the timeline, then build and return the advice dict."""
        if not self.path_check():
            return self.output_format_data
        self.preparse()
        self.process()
        self.output()
        return self.output_format_data

    def process(self):
        """Collect distinct optimizer step names and advise fused replacements."""
        optimizer_entries = self.preparse_data[self.PREPARSE_TYPE.OPTIMIZER]
        if not optimizer_entries:
            return

        # sorted() for deterministic output — bare set iteration order is arbitrary.
        # preparse() only stores entries with a truthy name, so no None values here.
        self.cur_data = sorted(set(entry.get("name") for entry in optimizer_entries))
        advice_lines = []
        for opt_name in self.cur_data:
            fused_optimizer = self.OPTIMIZER_MAP.get(opt_name)
            if fused_optimizer is None:
                # No fused counterpart known for this optimizer. Previously this
                # produced "You can choose None to replace ...", which is misleading.
                continue
            advice_lines.append(
                f"You can choose {fused_optimizer} to replace the current Optimizer: {opt_name}.")
        self.cur_advice = "\n".join(advice_lines)
        self.cur_bottleneck = self.cur_advice
diff --git a/profiler/advisor_review/advisor_backend/timeline_advice/timeline_advice_base.py b/profiler/advisor_review/advisor_backend/timeline_advice/timeline_advice_base.py
new file mode 100644
index 00000000000..4c7ac96cd22
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/timeline_advice/timeline_advice_base.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import abstractmethod
+from collections import defaultdict
+import json
+import os
+
+from advice_base import AdviceBase
+from common_func.file_manager import FileManager
+
+
class TimelineAdviceBase(AdviceBase):
    """Base class for timeline (trace_view.json) based advice generators.

    Subclasses implement ``run`` and extend ``output``; the base provides
    input validation and a one-pass preparse that buckets trace events.
    """

    class PREPARSE_TYPE:
        # bucket ids used as keys of ``preparse_data``
        OPTIMIZER = 0
        STEP = 1
        OVERLAP_CPT = 2
        OVERLAP_FREE = 3
        OVERLAP_CMU = 4
        ENQUEUE = 5
        DEQUEUE = 6
        HOST_TO_DEVICE = 7
        SYNCHRONIZE = 8

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.trace_view_path = ""
        self.has_preparse = False
        self.preparse_data = defaultdict(list)
        # exact event names mapped straight to their preparse bucket
        self.entry_map = {
            'Computing': self.PREPARSE_TYPE.OVERLAP_CPT,
            'Free': self.PREPARSE_TYPE.OVERLAP_FREE,
            'AscendCL@aclrtSynchronizeDevice': self.PREPARSE_TYPE.SYNCHRONIZE
        }

    def path_check(self):
        """
        check whether input path is valid
        """
        if not os.path.exists(self.collection_path):
            print("[ERROR] Path: {} is not exist.".format(self.collection_path))
            return False
        is_ascend_pt_dir = os.path.isdir(self.collection_path) and self.collection_path.endswith("ascend_pt")
        is_trace_view_file = os.path.isfile(self.collection_path) and \
            os.path.basename(self.collection_path) == "trace_view.json"
        if is_ascend_pt_dir:
            output_dir = os.path.join(self.collection_path, "ASCEND_PROFILER_OUTPUT")
            self.trace_view_path = os.path.join(output_dir, "trace_view.json")
            if not os.path.exists(self.trace_view_path):
                print("[ERROR] trace_view.json is not exist in the Path: {}.".format(output_dir))
                return False
        elif is_trace_view_file:
            self.trace_view_path = self.collection_path
        else:
            print("[ERROR] Please input ascend_pt or trace_view.json.")
            return False
        print("[INFO] Start to analyse the target file: {}".format(self.trace_view_path))
        return True

    @abstractmethod
    def run(self):
        """
        analyze profiling data and advice
        """

    @abstractmethod
    def output(self):
        """
        output relevant data
        """
        # abstract yet with a body: subclasses are expected to call super().output()
        # after filling cur_data / cur_bottleneck / cur_advice
        self.output_format_data[self.DATA] = self.cur_data
        self.output_format_data[self.BOTTLENECK] = self.cur_bottleneck
        self.output_format_data[self.ADVICE] = self.cur_advice

    def preparse(self):
        """Bucket trace events into ``preparse_data`` once per instance."""
        if self.has_preparse:
            return
        trace_events = FileManager.read_json_file(self.trace_view_path)
        if not isinstance(trace_events, list):
            return
        for event in trace_events:
            event_name = event.get("name", None)
            if not event_name:
                continue
            is_optimizer_step = event_name.startswith("Optimizer.step#") and event_name.endswith(".step")
            if is_optimizer_step:
                self.preparse_data[self.PREPARSE_TYPE.OPTIMIZER].append(event)
            elif event_name.startswith("ProfilerStep#"):
                self.preparse_data[self.PREPARSE_TYPE.STEP].append(event)
            elif event_name in self.entry_map:
                self.preparse_data[self.entry_map[event_name]].append(event)
        self.has_preparse = True
diff --git a/profiler/advisor_review/analyzer/__init__.py b/profiler/advisor_review/analyzer/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/base_analyzer.py b/profiler/advisor_review/analyzer/base_analyzer.py
new file mode 100644
index 00000000000..5f4bd3202cd
--- /dev/null
+++ b/profiler/advisor_review/analyzer/base_analyzer.py
@@ -0,0 +1,94 @@
+import logging
+from functools import wraps
+from typing import Dict, List, Union
+from abc import abstractmethod, ABCMeta
+
+from profiler.advisor.common import constant
+from profiler.advisor.common.version_control import VersionControl
+from profiler.advisor.dataset.dataset import Dataset
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.display.html.render import HTMLRender
+
+logger = logging.getLogger()
+
+
class BaseAnalyzer(VersionControl, metaclass=ABCMeta):
    """Abstract base for analyzers: wires datasets, results and HTML rendering."""

    _SUPPORT_VERSIONS = constant.SUPPORTED_CANN_VERSION

    # dataset classes this analyzer depends on; concrete subclasses override
    dataset_cls_list = []

    def __init__(self, collection_path, n_processes: int = 1, **kwargs):
        self.n_processes = n_processes
        self.cann_version = kwargs.get("cann_version", constant.DEFAULT_CANN_VERSION)
        self.torch_version = kwargs.get("torch_version", constant.DEFAULT_TORCH_VERSION)
        self.html_render = HTMLRender()
        self.collection_path = collection_path
        self.kwargs = kwargs
        self.dataset_list: Dict[str, List[Dataset]] = {}
        self.init_dataset_list()
        self.result = OptimizeResult()
        self.record_list: Dict[str, List] = {}

    @classmethod
    def check_data(cls, data_list: tuple):
        """
        check if all data in data list is contained
        :param data_list: data list to check
        :return: func ptr if check success
        """

        def decorate(func):

            @wraps(func)
            def wrapper(self, **kwargs):
                data = self.dataset_list
                if data is None:
                    return None
                for data_key in data_list:
                    if data_key not in data:
                        return None

                logger.info("Enable analysis %s with %s", self.__class__.__name__, ",".join(data_list))
                # forward keyword arguments to the wrapped function;
                # previously they were accepted here but silently dropped
                return func(self, **kwargs)

            return wrapper

        return decorate

    @abstractmethod
    def optimize(self, **kwargs):
        pass

    @abstractmethod
    def make_record(self):
        pass

    @abstractmethod
    def make_render(self):
        pass

    def init_dataset_list(self) -> None:
        """Instantiate each class in ``dataset_cls_list`` and index it by its key."""
        dataset_cls_list = self.dataset_cls_list
        if len(dataset_cls_list) == 0:
            # plain %-style lazy formatting (an f-string prefix here would defeat it)
            logger.warning("Analyser: %s don't rely on any dataset!", self.__class__.__name__)
            return

        for dataset_cls in dataset_cls_list:
            if dataset_cls and callable(dataset_cls):
                dataset = dataset_cls(collection_path=self.collection_path, data=self.dataset_list, **self.kwargs)
                key = dataset_cls.get_key()
                if key not in self.dataset_list:
                    self.dataset_list[key] = []
                self.dataset_list[key].append(dataset)

    @staticmethod
    def get_first_data_by_key(data, key) -> Union[Dataset, None]:
        """
        get the first member from data with key
        :param data: input data
        :param key: data key
        :return: the first dataset in dataset list
        """
        if key in data and len(data[key]) > 0:
            return data[key][0]
        return None
diff --git a/profiler/advisor_review/analyzer/cluster/__init__.py b/profiler/advisor_review/analyzer/cluster/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/cluster/slow_link_analyser.py b/profiler/advisor_review/analyzer/cluster/slow_link_analyser.py
new file mode 100644
index 00000000000..846b79a50f3
--- /dev/null
+++ b/profiler/advisor_review/analyzer/cluster/slow_link_analyser.py
@@ -0,0 +1,122 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import Dict, List
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.common import constant
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataSet
+
+
class SlowLinkAnalyzer(BaseAnalyzer):
    """Analyze per-rank RDMA/SDMA bandwidth and report slow-link bottlenecks."""

    RDMA_TIME_MS = "RDMA time(ms)"
    RDMA_SIZE_MB = "RDMA size(mb)"
    SDMA_TIME_MS = "SDMA time(ms)"
    SDMA_SIZE_MB = "SDMA size(mb)"
    RDMA_BANDWIDTH = "RDMA bandwidth(GB/s)"
    SDMA_BANDWIDTH = "SDMA bandwidth(GB/s)"
    COMMUNICATION_BANDWIDTH_INFO = "Communication Bandwidth Info"
    TRANSIT_TIME = "Transit Time(ms)"
    TRANSIT_SIZE = "Transit Size(MB)"
    SDMA = "SDMA"
    RDMA = "RDMA"
    SLOW_LINK_ANALYSIS = "slow_link_analysis"
    dataset_cls_list = [ClusterCommunicationDataSet]

    def __init__(self, collection_path, n_processes: int = 1, **kwargs):
        super().__init__(collection_path, n_processes, **kwargs)
        key = ClusterCommunicationDataSet.get_key()
        self.communication_data_class = self.get_first_data_by_key(self.dataset_list, key)
        self.rank_bw_dict = self.communication_data_class.get_data()
        self.result = OptimizeResult()
        self.bottelneck = ''
        self.suggestion = ''
        # keep dict shape from the start: make_record/make_render index this by
        # string key, which crashed when it was still the initial list
        self.format_datas = {"headers": [], "data": []}

    def optimize(self, **kwargs):
        """Run the analysis; return an OptimizeResult (empty when data failed to load)."""
        if self.rank_bw_dict is None:
            print("slow_link 分析失败,原因是数据加载失败,请检查你的cluster_analysis_output文件夹, \
                如不关心这类数据请忽略")
            return self.result
        self.process()
        self.format_datas = self.format_details()
        self.make_record()
        self.make_render()
        return self.result

    def process(self):
        """Check both link types for cross-rank bandwidth gaps."""
        if self.rank_bw_dict:
            self.produce_bottleneck(self.RDMA_BANDWIDTH)
            self.produce_bottleneck(self.SDMA_BANDWIDTH)

    def produce_bottleneck(self, link_type: str):
        """Append a description when bandwidth of ``link_type`` varies across ranks."""
        data_list = [rank_dict.get(link_type, 0) for rank_id, rank_dict in self.rank_bw_dict.items()]
        if not data_list:
            # no ranks -> nothing to report (also avoids ZeroDivisionError below)
            return
        avg_bw = round(sum(data_list) / len(data_list), 3)
        if avg_bw == 0:
            return
        self.bottelneck += f'{link_type}: \n' \
                           f'    The average is {avg_bw}, \n' \
                           f'    while the maximum  is {round(max(data_list), 3)}GB/s \n' \
                           f'    and the minimum is {round(min(data_list), 3)}GB/s. \n' \
                           f'    the difference is {round(max(data_list) - min(data_list), 3)}GB/s. \n'

    def format_details(self):
        """Build the per-rank bandwidth table: {'headers': [...], 'data': [...]}."""
        if not self.rank_bw_dict:
            return {
                "headers": [],
                "data": []
            }

        # union of all metric names seen on any rank, in stable sorted order
        headers = sorted({k for rank_bw_value in self.rank_bw_dict.values() for k in rank_bw_value.keys()})
        data_list = [[rank_id] + [rank_bw.get(k, 0) for k in headers]
                     for rank_id, rank_bw in self.rank_bw_dict.items()]
        data_list.sort(key=lambda x: x[0])  # sort rows by rank_id

        return {
            "headers": ["rank_id"] + headers,
            "data": data_list
        }

    def make_record(self):
        """
        make record for what and how to optimize
        """
        optimization_item = OptimizeItem(
            SlowLinkAnalyzer.SLOW_LINK_ANALYSIS,
            self.bottelneck,
            self.suggestion
        )
        self.result.add(OptimizeRecord(optimization_item))

        for row in self.format_datas["data"]:
            self.result.add_detail(SlowLinkAnalyzer.SLOW_LINK_ANALYSIS, self.format_datas["headers"], row)

    def make_render(self):
        """Render the analysis into the cluster HTML template."""
        result_for_html = {
            "Description" : self.bottelneck,
            "suggestion" : self.suggestion,
            "details" : [self.format_datas]
        }

        self.html_render.render_template(key="cluster",
                                         title=SlowLinkAnalyzer.SLOW_LINK_ANALYSIS,
                                         template_dir="templates",
                                         template_name="cluster_analysis.html",
                                         cann_version=self.cann_version,
                                         torch_version=self.torch_version,
                                         result=result_for_html)
\ No newline at end of file
diff --git a/profiler/advisor_review/analyzer/cluster/slow_rank_analyser.py b/profiler/advisor_review/analyzer/cluster/slow_rank_analyser.py
new file mode 100644
index 00000000000..4215b514a21
--- /dev/null
+++ b/profiler/advisor_review/analyzer/cluster/slow_rank_analyser.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import Dict, List
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.common import constant
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.advisor.dataset.cluster.cluster_dataset import ClusterStepTraceTimeDataSet
+
+
class SlowRankAnalyzer(BaseAnalyzer):
    """Analyze per-rank step trace times and report compute/communication/free skew."""

    SLOW_RANK_ANALYSIS = "slow_rank_analysis"
    RANK = "rank"
    # minimum (max-min)/mean gap for a category to count as a bottleneck
    RATIO_THRESHOLD = 0.05
    BOTTLENECK_LIST = ['Computing', 'Communication', "Free"]
    dataset_cls_list = [ClusterStepTraceTimeDataSet]

    def __init__(self, collection_path, n_processes: int = 1, **kwargs):
        super().__init__(collection_path, n_processes, **kwargs)
        key = ClusterStepTraceTimeDataSet.get_key()
        self.step_trace_class = self.get_first_data_by_key(self.dataset_list, key)
        self.step_trace_dict = self.step_trace_class.get_data()
        self.result = OptimizeResult()
        self.bottelneck = ''
        self.suggestion = ''
        # keep dict shape from the start: make_record indexes this by string key,
        # which crashed when it was still the initial list
        self.format_datas = {"headers": [], "data": []}

    def optimize(self, **kwargs):
        """Run the analysis; return an OptimizeResult (empty when data failed to load)."""
        if self.step_trace_dict is None:
            print("slow_rank 分析失败,原因是数据加载失败,请检查你的cluster_analysis_output文件夹 \
                如不关心这类数据请忽略")
            return self.result
        self.process()
        self.format_datas = self.format_details()
        self.make_record()
        self.make_render()
        return self.result

    def process(self):
        """Scan every bottleneck category for abnormal cross-rank gaps."""
        total_time_list = [sum(data_tuple) for rank_id, data_tuple in self.step_trace_dict.items()]
        if total_time_list:
            mean_total_time = sum(total_time_list) / len(total_time_list)
            for produce_type in range(len(self.BOTTLENECK_LIST)):
                self.produce_bottleneck(self.step_trace_dict, produce_type, mean_total_time)

    def produce_bottleneck(self, step_dict: dict, produce_type: int, mean_total_time: float):
        """Append a description when one category's cross-rank gap exceeds the threshold."""
        data_list = [data_tuple[produce_type] for rank_id, data_tuple in step_dict.items()]
        max_ratio = self.compute_max_gap_ratio(data_list, mean_total_time)
        if max_ratio > self.RATIO_THRESHOLD:
            self.bottelneck += f'{self.BOTTLENECK_LIST[produce_type]} \n' \
                               f'    has some issues in the cluster, \n' \
                               f'    because the max difference of {self.BOTTLENECK_LIST[produce_type]} time \n' \
                               f'    has reached {round(max_ratio * mean_total_time / 1000, 3)}ms. \n'

    def make_record(self):
        """
        make record for what and how to optimize
        """
        optimization_item = OptimizeItem(
            SlowRankAnalyzer.SLOW_RANK_ANALYSIS,
            self.bottelneck,
            self.suggestion
        )
        self.result.add(OptimizeRecord(optimization_item))
        for row in self.format_datas["data"]:
            self.result.add_detail(SlowRankAnalyzer.SLOW_RANK_ANALYSIS, self.format_datas["headers"], row)

    def format_details(self):
        """Build the per-rank time table: {'headers': [...], 'data': [...]}."""
        headers = ["rank_id", "compute", "communication", "free"]
        # assumes each value is a list like [compute, communication, free] — TODO confirm
        data_list = [[rank_id] + value for rank_id, value in self.step_trace_dict.items()]
        return {
            "headers": headers,
            "data": data_list
        }

    def make_render(self):
        """Render the analysis into the cluster HTML template."""
        result_for_html = {
            "Description" : self.bottelneck,
            "suggestion" : self.suggestion,
            "details" : [self.format_datas]
        }

        self.html_render.render_template(key="cluster",
                                         title=SlowRankAnalyzer.SLOW_RANK_ANALYSIS,
                                         template_dir="templates",
                                         template_name="cluster_analysis.html",
                                         cann_version=self.cann_version,
                                         torch_version=self.torch_version,
                                         result=result_for_html)

    @staticmethod
    def compute_max_gap_ratio(data: list, mean: float):
        """Return (max - min) / mean of ``data``; 0 when mean is zero."""
        if mean == 0:
            return 0
        return (max(data) - min(data)) / mean
diff --git a/profiler/advisor_review/analyzer/communication/__init__.py b/profiler/advisor_review/analyzer/communication/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/communication/bandwidth/__init__.py b/profiler/advisor_review/analyzer/communication/bandwidth/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/communication/environment/__init__.py b/profiler/advisor_review/analyzer/communication/environment/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/computation/__init__.py b/profiler/advisor_review/analyzer/computation/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/computation/aicpu/__init__.py b/profiler/advisor_review/analyzer/computation/aicpu/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/computation/aicpu/aicpu_checker.py b/profiler/advisor_review/analyzer/computation/aicpu/aicpu_checker.py
new file mode 100644
index 00000000000..4eca1c6c027
--- /dev/null
+++ b/profiler/advisor_review/analyzer/computation/aicpu/aicpu_checker.py
@@ -0,0 +1,278 @@
+import copy
+import os
+from functools import partial
+from typing import List, Dict, Optional
+
+import yaml
+from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker, logger
+from profiler.advisor.analyzer.schedule.fusion_ops.timeline_api_stack_checker import OpStackFinder
+from profiler.advisor.common import constant
+from profiler.advisor.dataset.dataset import Dataset
+from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
+from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
+
+
class AicpuChecker(OperatorChecker):
    """Detect AI_CPU-dispatched operators with significant task duration.

    Flags AICPU operators, attaches Python stack info and rule-driven
    suggestions to each, and formats the result for the HTML report.
    """
    _CHECKER = "aicpu operator"
    _PROBLEM = "AICPU operator"
    # minimum task duration (us) for the check to report anything at all
    _MIN_TASK_DURATION = 20
    _description = f"Some operators and task duration exceed {_MIN_TASK_DURATION} us, such as :\n"
    _SUGGESTION: List[str] = ["Modify code to avoid aicpu operator"]
    STACK_INFO_ITEMS = "stack_info"
    SUGGESTION_INFO_ITEMS = "suggestions"
    _ITEMS = [
        "op_name", "op_type", "task_duration", "input_shapes", "input_data_types", "input_formats", "output_shapes",
        "output_data_types", "output_formats"
    ]

    def __init__(self, cann_version):
        super(AicpuChecker, self).__init__(cann_version=cann_version)
        self.aicpu_rules: Dict = {}
        self.aicpu_checker: Dict = {}
        self.load_aicpu_rules()

    def _check_data(self, profiling_data: ProfilingDataset) -> bool:
        # only the op summary is required by this checker
        if not self._check_summary(profiling_data):
            return False
        return True

    def _check_operator(self, op_info) -> bool:
        # an operator is relevant iff it was dispatched to AI_CPU
        return op_info.task_type == constant.AI_CPU

    def load_aicpu_rules(self, rule_path="rules/aicpu_rules.yaml") -> Dict:
        """Load the yaml rule file and instantiate the checkers it enables.

        Relative ``rule_path`` is resolved three directory levels up from this
        module. NOTE(review): the declared Dict return is only honored on the
        missing-file early exit; the normal path returns None implicitly.
        """
        if not os.path.isabs(rule_path):
            rule_path = os.path.join(os.path.dirname(__file__),
                                     "../../../", rule_path)

        if not os.path.exists(rule_path):
            logger.warning("Skip analyze aicpu issues, because %s does not exist.", rule_path)
            return {}
        with open(rule_path, 'r') as f:
            self.aicpu_rules = yaml.safe_load(f)
        self.filter_aicpu_rules(self.aicpu_rules)
        for checker_name, check_rule in self.aicpu_rules.items():
            if not isinstance(check_rule, (list, dict,)):
                continue

            if checker_name not in AICPU_CHECKER.keys():
                logger.warning("Skip %s, which is not support now.", checker_name)
                continue

            self.aicpu_checker[checker_name] = AICPU_CHECKER[checker_name](check_rule)

    def filter_aicpu_rules(self, aicpu_rules):
        # keep only DataTypeChecker entries matching the current CANN version
        support_checkers = []
        for checkers in aicpu_rules['CommonChecker']:
            for key, value in checkers.items():
                if key == 'DataTypeChecker' and self.cann_version in value['cann_version']:
                    support_checkers.append(checkers)
        aicpu_rules['CommonChecker'] = support_checkers
        return

    def check_aicpu_attr(self, op_info) -> List[str]:
        # run every enabled rule checker and pool their suggestions
        suggestions = []
        for _, checker in self.aicpu_checker.items():
            suggestions.extend(checker.check(op_info))
        return suggestions

    def check(self, profiling_data: ProfilingDataset) -> bool:
        """
        check if any operator need optimize
        :param profiling_data: profiling datasest
        :return: true or false
        """

        if not self._check_data(profiling_data):
            return False
        op_summary = profiling_data.op_summary

        def get_opeartor_stack_info(api_stack_finder: OpStackFinder, op_name_list: list) -> list:
            data: Dict[str, Dataset] = {}
            event_dataset = TimelineEventDataset(collection_path=profiling_data.collection_path, data=data, task_type=constant.AI_CPU)

            # disable multiprocessing, avoid cost time of enable new process for light task
            api_stack_finder.get_api_stack_by_op(event_dataset, op_name_list, constant.AI_CPU,
                                                 disable_multiprocess=True)
            return api_stack_finder._stack_record

        self._op_list = []
        total_task_duration = 0.0
        max_task_duration = 0.0
        for op_info in op_summary.op_list:
            if self._check_operator(op_info):
                self._op_list.append(op_info)

            task_duration = float(op_info.task_duration)
            total_task_duration += task_duration
            max_task_duration = max(max_task_duration, task_duration)
        if (not self._op_list) or (max_task_duration < self._MIN_TASK_DURATION):
            return False

        # collect stack information for all flagged operators
        op_name_list = []
        for op in self._op_list:
            if op.op_name not in op_name_list:
                op_name_list.append(op.op_name)
        api_stack_finder = OpStackFinder()
        stack_record = get_opeartor_stack_info(api_stack_finder, op_name_list)

        # build the task_id -> stack info mapping
        self._op_list.sort(key=lambda x: int(x.task_id))
        stack_record.sort(key=lambda x: x[0])
        task_id_to_stack = dict()
        for stack in stack_record:
            task_id_to_stack[stack[0]] = stack[-1]

        # attach stack info and rule-based suggestions to each operator
        for op in self._op_list:
            stack = task_id_to_stack.get(int(op.task_id))
            op.add_attr(self.STACK_INFO_ITEMS, stack)
            suggestions = self.check_aicpu_attr(op)
            op.add_attr(self.SUGGESTION_INFO_ITEMS, suggestions)

        # flag operators computing in DOUBLE precision
        double_type_ai_cpu_operator = []
        for op in self._op_list:
            if not op.has_attr("input_data_types"):
                logger.warning(
                    "Skip checking of input data in AICPU checker because of not containing input_data_dtypes in op summary")
                break
            if op.has_attr(
                    "input_data_types") and "DOUBLE" in op.input_data_types and op.op_name not in double_type_ai_cpu_operator:
                double_type_ai_cpu_operator.append(op.op_name)
        if bool(double_type_ai_cpu_operator):
            self._SUGGESTION.append("Try to convert double type operator to float, such as {}".format(
                ",".join(double_type_ai_cpu_operator)))
        return True

    def make_render(self, html_render, record):
        # render into the computation section with the AI_CPU template,
        # listing all operators (no top-k limit)
        html_render.render_template(key="computation",
                                    template_dir="templates",
                                    template_name="operator_ai_cpu.html",
                                    format_result=self.format_operator_result(record, constant.OPERATOR_LIST_UNLIMIT))

    def format_operator_result(self, record, limit):
        """
        Format operator result to html
        :param record: profiling check record
        :param limit: Limit number of operator statistics lists.
        :return:
        """
        optimization_item = record.optimization_item
        release_suggestion_list = []
        for suggestion in optimization_item.suggestion:
            release_suggestion_list.append(suggestion.replace('\n', '
'))
        logger.debug("suggestion list is %s", release_suggestion_list)
        format_result = {"record": record.__dict__, "suggestion": '
'.join(release_suggestion_list),
                         "task_duration": round(record.statistics_item.task_duration, 2)}

        statistic = self.group_by(copy.deepcopy(self._op_list), op_key='op_type',
                                  limit=limit)
        format_result["statistic"] = statistic
        stack_key_list = ["stack_info", "input_data_types", "output_data_types"]
        if statistic:
            for key, info in statistic:
                op_info_list = self.group_by_list(info.get("op_info_list"), stack_key_list, limit)
                info["op_info_list"] = op_info_list
        return format_result

    def group_by_list(self, op_list, op_key_list: List = ["stack_info", "input_data_types", "output_data_types"],
                      limit: int = constant.OPERATOR_LIST_UNLIMIT):
        # NOTE(review): mutable default argument; not mutated here, but fragile
        if op_list is None:
            op_list = []

        # merge the listed attributes into one combined groupby key
        op_key = '+'.join(op_key_list)  # str, json
        for op_info in op_list:
            attribute = ""
            for _op in op_key_list:
                if op_info.get_attr(_op):
                    attribute += op_info.get_attr(_op)
            op_info.add_attr(op_key, attribute)

        return self.group_by(op_list, op_key=op_key, limit=limit)
+
+
class BaserChecker:
    """Base for rule checkers: runs registered checker callables over one op."""

    def __init__(self, *args, **kwargs):
        # callables taking op_info and returning a suggestion string or None
        self.checker_list = []

    def build(self):
        """Populate ``checker_list``; must be implemented by subclasses."""
        raise NotImplementedError

    def check(self, op_info) -> List[str]:
        """Return every non-None suggestion produced by the registered checkers."""
        return [result
                for result in (checker(op_info) for checker in self.checker_list)
                if result is not None]
+
+
class CommonChecker(BaserChecker):
    """Rule-driven checker; currently supports the DataTypeChecker rule kind."""

    def __init__(self, check_rules: List[Dict] = None):
        super(CommonChecker, self).__init__()
        self.check_rules = check_rules if check_rules is not None else []
        self.supported_checker = dict(DataTypeChecker=self.datatype_checker)
        self.build()

    @staticmethod
    def datatype_checker(check_item: Dict, op_info) -> Optional[str]:
        """Return a suggestion when the op uses dtypes outside the rule's valid sets."""
        supported_op_type = check_item.get('op_type', [])
        suggestion = check_item.get('suggestion', "")
        valid_inputs = check_item.get('input', [])
        valid_outputs = check_item.get('output', [])
        ignore_type = check_item.get('ignore_type', [])
        op_type = getattr(op_info, 'op_type', "UNKNOWN")
        # rule applies when it targets all op types or this one explicitly
        if "__ALL__" not in supported_op_type and op_type.lower() not in supported_op_type:
            return None
        if op_type.lower() in ignore_type:
            return None

        input_dtypes = {item.lower() for item in getattr(op_info, 'input_data_types', "").split(";")}
        output_dtypes = {item.lower() for item in getattr(op_info, 'output_data_types', "").split(";")}
        unsupported_dtype_diff = (input_dtypes - set(valid_inputs)) | (output_dtypes - set(valid_outputs))
        if not unsupported_dtype_diff:
            return None

        return suggestion.format(",".join(unsupported_dtype_diff).upper(),
                                 op_type,
                                 ",".join(valid_inputs).upper())

    def build(self):
        """Register a partial checker for every supported rule entry."""
        for check in self.check_rules:
            (check_func, check_rule), = check.items()
            checker_impl = self.supported_checker.get(check_func)
            if checker_impl is None:
                logger.warning("Skip %s, which has not been implemented.", check_func)
                continue
            self.checker_list.append(partial(checker_impl, check_rule))
+
+
class ExampleGuideChecker(BaserChecker):
    """Checker that points users at example/guide URLs for matched op types."""

    def __init__(self, check_rules: List[Dict] = None):
        super(ExampleGuideChecker, self).__init__()
        self.check_rules = check_rules if check_rules is not None else []
        self.build()

    def build(self):
        def _guide_url(check_item: Dict, op_info) -> Optional[str]:
            supported_op_type = check_item.get('op_type', [])
            url = check_item.get('url', "")
            suggestion = check_item.get('suggestion', "")

            if getattr(op_info, 'op_type', "UNKNOWN").lower() not in supported_op_type:
                return None
            # substitute the URL when the suggestion contains a placeholder
            return suggestion.format(url) if "{}" in suggestion else suggestion

        for check in self.check_rules:
            (_, check_rule), = check.items()
            self.checker_list.append(partial(_guide_url, check_rule))
+
+
# Registry mapping rule-file section names to their checker implementations.
AICPU_CHECKER = {
    "CommonChecker": CommonChecker,
    "ExampleGuideChecker": ExampleGuideChecker
}
diff --git a/profiler/advisor_review/analyzer/computation/bound/__init__.py b/profiler/advisor_review/analyzer/computation/bound/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor_review/analyzer/computation/bound/block_dim_checker.py
new file mode 100644
index 00000000000..a7d7ddd93c7
--- /dev/null
+++ b/profiler/advisor_review/analyzer/computation/bound/block_dim_checker.py
@@ -0,0 +1,75 @@
+import logging
+
+from typing import List
+
+from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker
+from profiler.advisor.common import constant
+from profiler.advisor.config.config import Config
+from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
+
+logger = logging.getLogger()
+
+
class BlockDimChecker(OperatorChecker):
    """Check whether operators' block_dim fully utilizes the available AI cores."""

    _SUGGESTION: List[str] = []
    _CHECKER = "block dim"
    _PROBLEM = "block dim"
    _description = "some operator does not make full use of {} ai core"
    # safe defaults: previously these attributes only existed after _check_data
    # populated them from config, so _check_operator/get_core_num could raise
    # AttributeError when called first
    _aicore_num = 0
    _aiv_num = 0
    _ITEMS = [
        "op_name", "op_type", "task_type", "task_duration", "income", "block_dim", "mix_block_dim", "input_shapes",
        "input_data_types", "input_formats", "output_shapes", "output_data_types", "output_formats"
    ]

    def pre_check(self, profiling_data) -> bool:
        # dynamic-shape profiles have no meaningful block_dim to check
        return not self.is_dynamic_shape(profiling_data)

    def _check_data(self, data):
        """Validate that the op summary and core-count config are usable."""
        self.format_suggestion_content(data)
        if not self._check_summary(data):
            return False
        if not Config().get_config("ai_core_num"):
            logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "ai core num in info.json file")
            return False
        summary = data.op_summary
        # assumes _check_summary guarantees a non-empty op_list — TODO confirm
        op_info = summary.op_list[0]
        if not hasattr(op_info, "block_dim"):
            logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "block dim in op summary")
            return False
        if Config().get_config("ai_core_num"):
            self._aicore_num = int(Config().get_config("ai_core_num"))
        if Config().get_config("aiv_num"):
            self._aiv_num = int(Config().get_config("aiv_num"))
        self._description = self._description.format(self._aicore_num)
        if self._aiv_num:
            self._description += f" or {self._aiv_num} ai vector core"
        self._description += f";\n Top-{OperatorChecker._MAX_TUNE_OP_NUM} operator of " \
                             "task duration are as follows:\n"
        return True

    def make_render(self, html_render, record):
        # render into the computation section, limited to the top-k operators
        html_render.render_template(key="computation",
                                    template_dir="templates",
                                    template_name="operator_block_dim.html",
                                    format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK))

    def _check_operator(self, op_info) -> bool:
        """Return True when the op's block_dim does not divide evenly over the cores."""
        if op_info.task_type not in ["AI_CORE", "AI_VECTOR_CORE", "MIX_AIC"]:
            return False
        block_dim = int(op_info.block_dim)
        core_num = self.get_core_num(op_info)
        if not core_num:
            # core count unknown (config missing): cannot judge utilization
            return False
        if block_dim % core_num == 0:
            return False
        if op_info.task_type == "MIX_AIC" and hasattr(op_info, "mix_block_dim") \
                and self._aiv_num and int(op_info.mix_block_dim) % self._aiv_num == 0:
            return False
        return True

    def get_core_num(self, op_info):
        """
        get core num of task type
        """
        if op_info.task_type == "AI_CORE" or not self._aiv_num:
            core_num = self._aicore_num
        else:
            core_num = self._aiv_num
        return core_num
diff --git a/profiler/advisor_review/analyzer/computation/bound/operator_bound_checker.py b/profiler/advisor_review/analyzer/computation/bound/operator_bound_checker.py
new file mode 100644
index 00000000000..a22b380f974
--- /dev/null
+++ b/profiler/advisor_review/analyzer/computation/bound/operator_bound_checker.py
@@ -0,0 +1,53 @@
+import logging
+from typing import List
+
+from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker
+from profiler.advisor.common import constant
+from profiler.advisor.config.config import Config
+from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
+from profiler.advisor.utils.utils import to_percent
+
+logger = logging.getLogger()
+
+
class OperatorBoundChecker(OperatorChecker):
    """
    Checker reporting operators whose vec/mac/scalar/mte ratios are all below
    the configured operator_bound_ratio, i.e. operators that are not bound by
    any execution engine and are therefore tuning candidates.
    """
    _MIN_TASK_DURATION = 20  # min task duration 20us
    _CHECKER = "operator no bound"
    _PROBLEM = "operator no bound"
    _SUGGESTION: List[str] = []
    _description = (
        f"There is no mte, cube, vector, scalar ratio is more than {to_percent(Config().operator_bound_ratio)};\n" +
        f"Top task duration operators need to be tuned are as follows: \n")
    # columns exported to the detail table of the report
    _ITEMS = [
        "op_name", "op_type", "task_type", "task_duration", "vec_ratio", "mac_ratio", "scalar_ratio", "mte1_ratio",
        "mte2_ratio", "mte3_ratio", "block_dim", "input_shapes", "input_data_types", "input_formats", "output_shapes",
        "output_data_types", "output_formats"
    ]

    def pre_check(self, profiling_data) -> bool:
        # dynamic-shape jobs are excluded from this check
        return not self.is_dynamic_shape(profiling_data)

    def _check_data(self, data):
        """Return True when the op summary carries the ratio columns this checker needs."""
        self.format_suggestion_content(data)
        if not self._check_summary(data):
            return False
        # NOTE(review): only the FIRST row of op_list is inspected -- the loop
        # returns unconditionally on its first iteration, so the warning below
        # is reachable only for an empty op_list. Confirm a single-row probe of
        # _check_operator is the intended data-availability check.
        for op_info in data.op_summary.op_list:
            return self._check_operator(op_info)

        logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "ratio in op summary")
        return False

    def _check_operator(self, op_info) -> bool:
        """Return True when no engine ratio of the operator exceeds the bound ratio."""
        bound_list = ["vec_ratio", "mac_ratio", "scalar_ratio", "mte1_ratio", "mte2_ratio", "mte3_ratio"]
        ratio_list = [self.get_ratio(op_info, attr) for attr in bound_list]
        if not any(ratio_list):
            return False  # no data, skip check
        if any(ratio and ratio > Config().operator_bound_ratio for ratio in ratio_list):
            return False
        return True

    def make_render(self, html_render, record):
        """Render the no-bound check result into the "computation" report section."""
        html_render.render_template(key="computation",
                                    template_dir="templates",
                                    template_name="operator_no_bound.html",
                                    format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK))
diff --git a/profiler/advisor_review/analyzer/computation/op_compile/__init__.py b/profiler/advisor_review/analyzer/computation/op_compile/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/computation/op_compile/dynamic_shape_checker.py b/profiler/advisor_review/analyzer/computation/op_compile/dynamic_shape_checker.py
new file mode 100644
index 00000000000..86d3bac4ff8
--- /dev/null
+++ b/profiler/advisor_review/analyzer/computation/op_compile/dynamic_shape_checker.py
@@ -0,0 +1,65 @@
+import copy
+import logging
+from typing import List
+
+from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker
+from profiler.advisor.common import constant
+from profiler.advisor.dataset.profiling.info_collection import OpInfo
+from profiler.advisor.result.item import OptimizeItem, StatisticsItem, OptimizeRecord
+
+logger = logging.getLogger()
+
+
class DynamicShapeChecker(OperatorChecker):
    """
    Checker reporting jobs that run entirely with dynamic-shape operators and
    suggesting to enable compiled (static) operator execution.
    """
    ENABLE_COMPILED_SUGGESTION = "Optimize by enabling compiled operator, such as:\n" \
                                 "`torch_npu.npu.set_compile_mode(jit_compile=False)`\n"
    _SUGGESTION: List[str] = [ENABLE_COMPILED_SUGGESTION]
    _CHECKER = "dynamic shape operator"
    _PROBLEM = "Dynamic shape operator"
    _description = "Found all operators are dynamic shape"
    _op_list: List[OpInfo] = []
    _tune_op_list: List[str] = []  # record op name to be tuned, and save to tune_ops_file.cfg
    _op_views: List = []

    def __init__(self, cann_version) -> None:
        super().__init__(cann_version=cann_version)

    def check(self, profiling_database) -> bool:
        """Return True when the profiling job contains no static-shape operator."""
        return self.is_dynamic_shape(profiling_database)

    def make_record(self, profiling_database) -> OptimizeRecord:
        """
        make record for what and how to optimize
        """
        optimization_item = OptimizeItem(
            self._PROBLEM,
            self._description,
            self._SUGGESTION
        )
        # no per-op statistics for this checker; counts a single issue
        statistics_item = StatisticsItem("", "", 1)
        return OptimizeRecord(optimization_item, statistics_item)

    def format_operator_result(self, record, limit=-1):
        """
        Format operator result to html
        :param record: profiling check record
        :param limit: Limit number of operator statistics lists.
        :return: dict consumed by the html template
        """
        optimization_item = record.optimization_item
        release_suggestion_list = []
        for suggestion in optimization_item.suggestion:
            release_suggestion = copy.deepcopy(suggestion)
            if release_suggestion == DynamicShapeChecker.ENABLE_COMPILED_SUGGESTION:
                release_suggestion += "for details please refer to link : LINK"
            # fix(review): the replacement literal was garbled in the patch
            # (raw newline inside the string); restore the HTML line break used
            # by the report templates -- TODO confirm '<br>' vs '<br/>'
            release_suggestion_list.append(release_suggestion.replace('\n', '<br>'))
        format_result = {"record": record.__dict__, "suggestion": '<br>'.join(release_suggestion_list)}
        return format_result

    def make_render(self, html_render, record):
        """Render the dynamic-shape check result into the "computation" report section."""
        html_render.render_template(key="computation",
                                    template_dir="templates",
                                    template_name="operator_dynamic_shape.html",
                                    format_result=self.format_operator_result(record))
diff --git a/profiler/advisor_review/analyzer/computation/operator_checker.py b/profiler/advisor_review/analyzer/computation/operator_checker.py
new file mode 100644
index 00000000000..0f47650943a
--- /dev/null
+++ b/profiler/advisor_review/analyzer/computation/operator_checker.py
@@ -0,0 +1,307 @@
+import copy
+import logging
+from textwrap import fill
+from typing import List
+
+from profiler.advisor.common import constant
+from profiler.advisor.common.version_control import VersionControl
+from profiler.advisor.config.config import Config
+from profiler.advisor.dataset.profiling.info_collection import OpInfo
+from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
+from profiler.advisor.result.item import OptimizeItem, StatisticsItem, OptimizeRecord
+from profiler.advisor.utils.utils import safe_division
+
+logger = logging.getLogger()
+
+
class OperatorChecker(VersionControl):
    """
    Base class of the operator checkers.

    A checker scans the op summary of a profiling dataset, collects operators
    matching its rule (_check_operator), and turns them into an OptimizeRecord
    plus detail rows and an HTML fragment for the advisor report.
    """
    _SUPPORT_VERSIONS = constant.SUPPORTED_CANN_VERSION
    _MAX_TUNE_OP_NUM = constant.OPERATOR_OUT_TOPK
    _MIN_TASK_DURATION = 0
    _MIN_TASK_DURATION_RATIO = 1.0
    _MIN_TOTAL_DURATION_RATIO = 1.0
    _CHECKER = str()
    _PROBLEM = str()
    _description = str()
    STACK_INFO_ITEMS = ""
    _ITEMS: List[str] = []
    _SUGGESTION: List[str] = []
    SKIP_CHECK_MSG = "Skip %s checker because of not containing %s"
    # NOTE(review): these class-level mutable lists are shared by every
    # instance of the same checker class -- confirm checkers are single-use
    # before relying on them across instances.
    _tune_op_info_list: List[OpInfo] = []
    PyTorch_OPERATOR_TUNE_SUGGESTION = f"Optimize operator by AOE, such as:\n" \
                                       f"'aoe --job_type=2 --model_path=$user_dump_path " \
                                       f"--tune_ops_file={Config().tune_ops_file}'\n"
    MSLite_OPERATOR_TUNE_SUGGESTION = f"Optimize operator by AOE in mindspore lite framework, such as:\n" \
                                      f"converter_lite --fmk=ONNX --optimize=ascend_oriented --saveType=MINDIR " \
                                      f"--modelFile=$user_model.onnx --outputFile=user_model --configFile=./config.txt\n"
    _tune_op_list: List[str] = []

    def __init__(self, cann_version: str):
        self.cann_version = cann_version
        self._op_list: List[OpInfo] = []

    def check(self, profiling_data: ProfilingDataset) -> bool:
        """
        check if any operator need optimize
        :param profiling_data: profiling datasest
        :return: true or false
        """
        if not self._check_data(profiling_data):
            return False

        summary = profiling_data.op_summary
        total_task_duration = 0.0
        max_task_duration = 0.0
        for op_info in summary.op_list:
            if not self._check_operator(op_info):
                continue
            task_duration = float(op_info.task_duration)
            total_task_duration += task_duration
            max_task_duration = max(max_task_duration, task_duration)
            self._op_list.append(op_info)
            if task_duration > self._MIN_TASK_DURATION:
                self._tune_op_info_list.append(op_info)

        # report only when the matched operators are significant either in
        # absolute duration or relative to the whole job
        if any([
            max_task_duration > self._MIN_TASK_DURATION,
            round(safe_division(max_task_duration, summary.get_total_task_duration()),
                  4) > self._MIN_TASK_DURATION_RATIO,
            round(safe_division(total_task_duration, summary.get_total_task_duration()), 4) >
            self._MIN_TOTAL_DURATION_RATIO,
        ]):
            self._op_list.sort(key=lambda x: float(x.get_attr("task_duration")), reverse=True)
            self._tune_op_info_list.sort(key=lambda x: float(x.get_attr("task_duration")), reverse=True)
            for op in self._op_list:
                if op.op_name not in self._tune_op_list and len(self._tune_op_list) < constant.OPERATOR_OUT_TOPK:
                    self._tune_op_list.append(op.op_name)
            return True
        return False

    def make_record(self, profiling_data: ProfilingDataset):
        """
        Make record for what and how to optimize
        :param profiling_data: profiling data
        :return: optimize record
        """
        task_duration_list = [float(op_info.get_attr("task_duration")) for op_info in self._op_list if
                              hasattr(op_info, "get_attr")]
        total_cost_time = sum(task_duration_list)
        total_task_duration = profiling_data.op_summary.get_total_task_duration()
        count = len(task_duration_list)
        statistics_item = StatisticsItem(total_task_duration, total_cost_time, count, self.get_incomes())
        optimization_item = OptimizeItem(
            self._PROBLEM,
            self._get_description(self._description, self.get_op_type_list(self._op_list)[:self._MAX_TUNE_OP_NUM]),
            self._SUGGESTION
        )
        return OptimizeRecord(optimization_item, statistics_item)

    def _get_description(self, description, op_type_list=None):
        """Append the top op types to the description, three per line."""
        if not op_type_list:
            return description

        desc_suffix = []
        for i, op_type in enumerate(op_type_list):
            if i % 3 == 0 and i != 0:
                desc_suffix.append("\n")

            desc_suffix.append(f"{op_type}")

            if i < len(op_type_list) - 1:
                desc_suffix.append(", ")

        description += "".join(desc_suffix)
        return description

    def pre_check(self, profiling_data) -> bool:
        """Hook: subclasses may veto the check before data is scanned."""
        return True

    def is_dynamic_shape(self, profiling_database: ProfilingDataset) -> bool:
        """Return True when the job contains no static-shape operator."""
        less_than_cann800_list = [constant.CANN_VERSION_C30, constant.CANN_VERSION_C13, constant.CANN_VERSION_C15]
        # before CANN 8.0.0 the op_state attribute comes from ge_info
        if self.cann_version in less_than_cann800_list:
            if hasattr(profiling_database, "ge_info"):
                ge_info = profiling_database.ge_info
                static_shape_operators = ge_info.get_static_shape_operators()
                if len(static_shape_operators) == 0:
                    return True
            else:
                logger.warning(
                    "Skip dynamic shape check because of not containing ge_info.db file in host filefloder.\n"
                    "To enable dynamic shape check, please try to set data_simplification=False in experimental_config.\n"
                    "More details please refer to link : %s", constant.ASCEND_PROFILER_URL)
        else:
            # since CANN 8.0.0 op_state is read from the op_summary file
            if hasattr(profiling_database, "op_summary"):
                static_shape_operators = profiling_database.op_summary.get_static_shape_operators()
                if len(static_shape_operators) == 0:
                    return True
            else:
                logger.warning(
                    "Skip dynamic shape check because of not containing op_summary.csv file in current filefloder."
                )
        return False

    def format_operator_result(self, record, limit):
        """
        Format operator result to html
        :param record: profiling check record
        :param limit: Limit number of operator statistics lists.
        :return: dict consumed by the html template
        """
        optimization_item = record.optimization_item
        release_suggestion_list = []
        for suggestion in optimization_item.suggestion:
            release_suggestion = copy.deepcopy(suggestion)
            if release_suggestion == OperatorChecker.PyTorch_OPERATOR_TUNE_SUGGESTION:
                release_suggestion += \
                    ("for details please refer to link : LINK")
            elif release_suggestion == OperatorChecker.MSLite_OPERATOR_TUNE_SUGGESTION:
                release_suggestion += \
                    (f"\nThe config file for MSLite AOE usage is as follows:\n" \
                     f"[ascend_context]\n" \
                     f"aoe_mode=\"operator tuning\"\n" \
                     f"--tune_ops_file={Config().tune_ops_file}\n"
                     f"\nFor details please refer to link : LINK")
            # fix(review): replacement literal was garbled in the patch; restore
            # the HTML line break -- TODO confirm '<br>' vs '<br/>'
            release_suggestion_list.append(release_suggestion.replace('\n', '<br>'))
        format_result = {"record": record.__dict__,
                         "suggestion": fill('<br>'.join(release_suggestion_list), width=200),
                         "task_duration": round(record.statistics_item.task_duration, 2)}
        statistic = self.group_by(copy.deepcopy(self._op_list), limit=limit)
        format_result["statistic"] = statistic
        return format_result

    def group_by(self, op_list, op_key="op_type",
                 limit: int = constant.OPERATOR_LIST_UNLIMIT):
        """
        group by Profiling.OpInfo's attribute key, then return top limit tuple by duration
        :param op_list: input a OpInfo list
        :param op_key: group by Profiling.OpInfo's attribute key
        :param limit: top limit num, if you do not need to limit the length of tuple, input -1(int)
        :return: list of (group key, {"summary": ..., "op_info_list": ...}) tuples
        """
        if op_list is None:
            op_list = []
        statistic = {}  # str, json
        for op_info in op_list:
            if statistic.get(op_info.get_attr(op_key)):
                statistic[op_info.get_attr(op_key)]["summary"]["total_duration"] = float(
                    statistic[op_info.get_attr(op_key)]["summary"]["total_duration"]) + float(
                    op_info.get_attr("task_duration", constant.DEFAULT_DURATION_ZERO))
                statistic[op_info.get_attr(op_key)]["summary"]["counts"] += 1
                stack_info = op_info.get_attr("stack_info")
                if stack_info:
                    # fix(review): garbled literal restored; stack traces are
                    # shown in html, so newlines become <br>
                    op_info.stack_info = stack_info.replace('\r\n', '<br>')
                statistic[op_info.get_attr(op_key)]["op_info_list"].append(op_info)
            else:
                statistic[op_info.get_attr(op_key)] = {"summary": {}, "op_info_list": []}
                statistic[op_info.get_attr(op_key)]["summary"]["op_type"] = op_info.get_attr(
                    "op_type", constant.DEFAULT_OPERATOR_TYPE)
                statistic[op_info.get_attr(op_key)]["summary"]["total_duration"] = float(
                    op_info.get_attr("task_duration", constant.DEFAULT_DURATION_ZERO))
                statistic[op_info.get_attr(op_key)]["summary"]["counts"] = 1
                stack_info = op_info.get_attr("stack_info")
                if stack_info:
                    op_info.stack_info = stack_info.replace('\r\n', '<br>')
                statistic[op_info.get_attr(op_key)]["op_info_list"] = [op_info]

        if statistic:
            # fix(review): inner loop variable no longer shadows the op_key parameter
            for group_key in statistic.keys():
                statistic[group_key]["summary"]["total_duration"] = round(
                    statistic[group_key]["summary"]["total_duration"], 2)
            # Grouped by op_type, sorted by total_duration, and obtained the top 10 operators that take the most time.
            if limit > 0:
                statistic = sorted(
                    statistic.items(), key=lambda kv: kv[1]["summary"]["total_duration"], reverse=True)[:limit]
            else:
                statistic = sorted(statistic.items(), key=lambda kv: kv[1]["summary"]["total_duration"], reverse=True)
        else:
            logger.warning("%s checker do not has results to format html", str(self.__class__.__name__))
        return statistic

    def _check_data(self, profiling_data):
        """Hook: subclasses verify the dataset carries the columns they need."""
        return True

    def _check_operator(self, op_info):
        """Hook: subclasses decide whether one operator matches the rule."""
        return False

    def _get_income(self, _op_info: OpInfo) -> float:
        """Hook: subclasses estimate the optimization income of one operator."""
        return 0

    def get_tune_op_list(self):
        """
        get tune op list
        :return: tune op list
        """
        return self._tune_op_list

    def get_views(self, _graph_data):
        """Get node views."""
        return []

    @classmethod
    def get_name(cls):
        """
        get name of checker
        :return: checker name
        """
        return cls._PROBLEM

    def get_incomes(self) -> float:
        """get incomes, also annotating each matched op with its rounded income"""
        incomes = 0.0
        for op_info in self._op_list:
            income = self._get_income(op_info)
            setattr(op_info, "income", round(income, 2))
            incomes += income
        return incomes

    def get_op_type_list(self, op_list: List[OpInfo]):
        """get op type list, preserving first-seen order without duplicates"""
        op_type_list = []
        for op_info in op_list:
            if op_info.op_type not in op_type_list:
                op_type_list.append(op_info.op_type)
        return op_type_list

    def _check_summary(self, data: ProfilingDataset):
        """Return True when the dataset carries an op summary."""
        if not hasattr(data, "op_summary"):
            logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "op summary")
            return False
        return True

    @staticmethod
    def get_ratio(op_info: OpInfo, attr: str) -> float:
        """Read a ratio attribute as float; missing or "N/A" values become 0."""
        if not op_info.has_attr(attr):
            return 0
        value = op_info.get_attr(attr)
        if not value or value == "N/A":
            return 0
        return float(value)

    def get_details(self) -> list:
        """
        get details of operator to be optimized
        :return: detail list; first row is the header
        """
        op_list = self._op_list
        if not op_list or not (self._ITEMS + [self.STACK_INFO_ITEMS]):
            return []
        details = []
        attrs = [attr for attr in (self._ITEMS + [self.STACK_INFO_ITEMS]) if op_list[0].has_attr(attr)]
        details.append(attrs)
        op_list = sorted(op_list, key=lambda x: float(x.get_attr("task_duration")), reverse=True)
        for op_info in op_list:
            # aicore_time is reported as aicore + aiv time combined
            content = [
                op_info.get_attr(attr) if attr != "aicore_time"
                else op_info.get_float_attr(attr, strict_mode=True) +
                     op_info.get_float_attr("aiv_time", strict_mode=True) for attr in attrs
            ]
            details.append(content)
        return details

    def format_suggestion_content(self, profiling_data: ProfilingDataset) -> None:
        """Append the AOE tuning suggestion matching the profiler framework."""
        if profiling_data.PROF_TYPE == constant.ASCEND_PYTORCH_PROFILER:
            self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION)
        elif profiling_data.PROF_TYPE == constant.MSLITE:
            self._SUGGESTION.append(self.MSLite_OPERATOR_TUNE_SUGGESTION)
diff --git a/profiler/advisor_review/analyzer/computation/profiling_analyzer.py b/profiler/advisor_review/analyzer/computation/profiling_analyzer.py
new file mode 100644
index 00000000000..86826177007
--- /dev/null
+++ b/profiler/advisor_review/analyzer/computation/profiling_analyzer.py
@@ -0,0 +1,89 @@
+import logging
+from abc import ABC
+from typing import Dict, List
+
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.common import constant
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.analyzer.computation.aicpu.aicpu_checker import AicpuChecker
+from profiler.advisor.analyzer.computation.bound.block_dim_checker import BlockDimChecker
+from profiler.advisor.analyzer.computation.bound.operator_bound_checker import OperatorBoundChecker
+from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker
+from profiler.advisor.analyzer.computation.op_compile.dynamic_shape_checker import DynamicShapeChecker
+from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker
+from profiler.advisor.display.html.render import HTMLRender
+from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
+from profiler.advisor.utils.utils import get_supported_subclass
+
+logger = logging.getLogger()
+
+
class ProfilingAnalyzer(BaseAnalyzer, ABC):
    """
    Base analyzer that drives a single OperatorChecker over a ProfilingDataset
    and collects its record, detail rows and tune-op list into the result.
    Subclasses only swap self.checker for a concrete checker.
    """
    dataset_cls_list = [ProfilingDataset]

    def __init__(self, collection_path, **kwargs) -> None:
        super().__init__(collection_path, **kwargs)
        self.checker = OperatorChecker(self.cann_version)
        self.html_render = HTMLRender()
        self.result = OptimizeResult()

    @BaseAnalyzer.check_data((ProfilingDataset.get_key(),))
    def optimize(self, **kwargs) -> OptimizeResult:
        """
        optimize operator
        :param data: input datasets
        :return: result
        """
        profiling_data = self.get_first_data_by_key(self.dataset_list, ProfilingDataset.get_key())
        checker = self.checker
        if not checker.pre_check(profiling_data):
            return self.result
        if checker.check(profiling_data):
            # add record
            record = checker.make_record(profiling_data)
            checker.make_render(self.html_render, record)
            self.result.add(record)
            # add details
            details = checker.get_details()
            if details:
                for i, detail in enumerate(details):
                    if i == 0:
                        # the first row is header
                        self.result.add_detail(checker.get_name(), headers=detail)
                    else:
                        self.result.add_detail(checker.get_name(), detail=detail)
            # add tune op list
            tune_op_list = checker.get_tune_op_list()
            if tune_op_list:
                self.result.add_tune_op_list(tune_op_list)

        return self.result

    def make_record(self):
        # record creation is delegated to the checker in optimize()
        pass

    def make_render(self):
        # rendering is delegated to the checker in optimize()
        pass
+
+
class DynamicShapeAnalyzer(ProfilingAnalyzer):
    """Profiling analyzer wired to the dynamic-shape operator checker."""

    def __init__(self, collection_path, **kwargs) -> None:
        super().__init__(collection_path, **kwargs)
        self.checker = DynamicShapeChecker(self.cann_version)
+
+
class BlockDimAnalyzer(ProfilingAnalyzer):
    """Profiling analyzer wired to the block-dim alignment checker."""

    def __init__(self, collection_path, **kwargs) -> None:
        super().__init__(collection_path, **kwargs)
        self.checker = BlockDimChecker(self.cann_version)
+
+
class OperatorBoundAnalyzer(ProfilingAnalyzer):
    """Profiling analyzer wired to the operator no-bound checker."""

    def __init__(self, collection_path, **kwargs) -> None:
        super().__init__(collection_path, **kwargs)
        self.checker = OperatorBoundChecker(self.cann_version)
+
class AicpuAnalyzer(ProfilingAnalyzer):
    """Profiling analyzer wired to the AI CPU operator checker."""

    def __init__(self, collection_path, **kwargs) -> None:
        super().__init__(collection_path, **kwargs)
        self.checker = AicpuChecker(self.cann_version)
\ No newline at end of file
diff --git a/profiler/advisor_review/analyzer/dataloader/__init__.py b/profiler/advisor_review/analyzer/dataloader/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/graph_fusion/__init__.py b/profiler/advisor_review/analyzer/graph_fusion/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/graph_fusion/graph_fusion_analyzer.py b/profiler/advisor_review/analyzer/graph_fusion/graph_fusion_analyzer.py
new file mode 100644
index 00000000000..326be83b8d4
--- /dev/null
+++ b/profiler/advisor_review/analyzer/graph_fusion/graph_fusion_analyzer.py
@@ -0,0 +1,49 @@
+from typing import List
+from functools import partial
+
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.dataset.graph_dataset import GraphDataset
+from profiler.advisor.analyzer.graph_fusion.graph_fusion_checker import GraphFusionRules
+from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
+from profiler.advisor.display.html.render import HTMLRender
+
+
class FusionOPAnalyzer(BaseAnalyzer):
    """
    fusion optimizer
    """
    # each rule maps a dataset kind to a GraphFusionRules factory bound to its rule file
    RULES = dict(graph_dataset=partial(GraphFusionRules, "rules/op_fusion_pass.yaml"))
    dataset_cls_list = [GraphDataset, ProfilingDataset]

    def __init__(self, collection_path, **kwargs) -> None:
        super(FusionOPAnalyzer, self).__init__(collection_path, **kwargs)
        self.result = OptimizeResult()
        self.html_render = HTMLRender()

    @BaseAnalyzer.check_data((GraphDataset.get_key(),))
    def optimize(self, **kwargs):
        """
        :return: result
        """
        # NOTE(review): dataset_list.get(...) may return None when the profiling
        # dataset is absent; _check treats None profiling_data as "no timing
        # info", but a None graph_data would raise in len() -- presumably the
        # check_data decorator guarantees GraphDataset is present; confirm.
        self._check(self.dataset_list.get("GraphDataset"), self.dataset_list.get("ProfilingDataset"))
        return self.result

    def _check(self, graph_data: List[GraphDataset],
               profiling_data: List[ProfilingDataset] = None) -> None:
        """Run every fusion rule against the (last) graph, with timings if available."""
        if len(graph_data) == 0 or graph_data[0].is_empty():
            return
        for _, rule in self.RULES.items():
            checker = rule()
            if profiling_data is None:
                checker.find_fusion_matched_issues(graph_data)
            else:
                checker.find_fusion_matched_issues_with_times(graph_data, profiling_data)
            checker.make_record(self.result)
            checker.make_render(self.html_render)

    def make_record(self):
        # record creation is delegated to the rule checker in _check()
        pass

    def make_render(self):
        # rendering is delegated to the rule checker in _check()
        pass
diff --git a/profiler/advisor_review/analyzer/graph_fusion/graph_fusion_checker.py b/profiler/advisor_review/analyzer/graph_fusion/graph_fusion_checker.py
new file mode 100644
index 00000000000..e64020fdfe2
--- /dev/null
+++ b/profiler/advisor_review/analyzer/graph_fusion/graph_fusion_checker.py
@@ -0,0 +1,207 @@
+import logging
+from typing import List
+
+from tqdm import tqdm
+
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord, StatisticsItem
+from profiler.advisor.common.graph.graph import Graph
+from profiler.advisor.common.graph.graph_parser import QueryGraphParser
+from profiler.advisor.dataset.graph_dataset import GraphDataset
+from profiler.advisor.common.graph.graph_match import find_isomorphisms
+
+logger = logging.getLogger()
+
+
class GraphFusionRules:
    """
    Searches the computation graph for subgraphs isomorphic to known fusion
    patterns, optionally attaches per-op task durations from profiling, and
    emits the matches as report records and HTML.

    Invariant kept throughout: self.candidates[i] and self.task_duration_list[i]
    describe the same fusion pattern (both lists are reordered together).
    """

    def __init__(self, fusion_rules: str):
        # path of the yaml file describing the fusion patterns
        self.fusion_rules = fusion_rules
        # list (per pattern) of isomorphism matches: {query_node: host_node}
        self.candidates = []
        # list (per pattern, per match) of task duration lists, same order as candidates
        self.task_duration_list = []

    @staticmethod
    def build_query_graph(query_graphs) -> List[Graph]:
        """Yield a built query Graph for every subgraph of every fusion rule."""
        for _, query_graph in query_graphs.fusion_rules.items():
            for sub_graph in query_graph:
                graph = Graph(*sub_graph)
                graph.build()
                yield graph

    def find_fusion_matched_issues(self, graphs: List[GraphDataset]):
        """Match each rule's query graph against the last graph of the first dataset."""
        query_graphs = QueryGraphParser(self.fusion_rules)
        with tqdm(total=query_graphs.num_rules, leave=False, ncols=100, unit=" rules") as pbar:
            pbar.set_description(f"Searching Isomorphic Subgraph")
            for query_graph in self.build_query_graph(query_graphs):
                query_candidates = find_isomorphisms(query_graph.graph, graphs[0].graphs[-1].graph)
                pbar.update(1)
                if len(query_candidates) > 0:
                    self.candidates.append(query_candidates)

    def find_fusion_matched_issues_with_times(self, graphs: List[GraphDataset], profiling):
        """Match fusion patterns and sort them by total task duration, descending."""
        self.find_fusion_matched_issues(graphs)
        if len(self.candidates) == 0 or len(profiling) == 0:
            return

        # prefer op_summary timings; fall back to msprof trace timings
        if not hasattr(profiling[0], 'op_summary') or profiling[0].op_summary is None:
            if hasattr(profiling[0], 'msprof'):
                self.match_time_from_msprof(profiling[0].msprof)
                return
            else:
                logger.warning("Skip analyze operator because of not containing op summary.")
                return

        self.match_time_from_summary(profiling[0].op_summary)
        time_duration_sum = []
        for task_duration in self.task_duration_list:
            time_duration_sum.append(sum([sum(duration) for duration in task_duration]))
        # reorder candidates and durations together, most expensive pattern first
        time_duration_index = sorted(range(len(time_duration_sum)),
                                     key=time_duration_sum.__getitem__,
                                     reverse=True)
        self.task_duration_list = [self.task_duration_list[i] for i in time_duration_index]
        self.candidates = [self.candidates[i] for i in time_duration_index]

    def match_time_from_summary(self, op_summary):
        """Fill task_duration_list from op summary rows; unmatched ops count as 0."""
        op_dict = op_summary.task_dict
        for candidates in self.candidates:
            candidate_duration = []
            for candidate in candidates:
                duration_list = []
                for node in candidate.values():
                    # require both the op name and (case-insensitive) op type to match
                    if node.op_name not in op_dict or op_dict[node.op_name][0].op_type.lower() != node.op_type.lower():
                        logger.warning("Operator %s is missing in op summary, which will be set to 0.", node.op_name)
                        duration_list.append(0.0)
                        continue
                    duration_list.append(float(op_dict[node.op_name][0].task_duration))
                candidate_duration.append(duration_list)
            self.task_duration_list.append(candidate_duration)

    def match_time_from_msprof(self, msprof):
        """Fill task_duration_list from msprof trace tasks; unmatched ops count as 0."""
        op_dict = dict()
        for task in msprof.tasks:
            if "item_id" not in task.args:
                continue
            op_dict[task.args["item_id"]] = {"task_duration": task.dur}
        for candidates in self.candidates:
            candidate_duration = []
            for candidate in candidates:
                duration_list = []
                for node in candidate.values():
                    if node.op_name not in op_dict:
                        logger.warning("Operator %s is missing in msprof, which will be set to 0.", node.op_name)
                        duration_list.append(0.0)
                        continue
                    duration_list.append(float(op_dict[node.op_name].get("task_duration")))
                candidate_duration.append(duration_list)
            self.task_duration_list.append(candidate_duration)

    def make_render(self, html_render):
        """Render all fusion matches (with optional timings) into the html report."""
        if not self.candidates:
            return

        candidates_list = []
        for case_id, nodes in enumerate(self.candidates):
            candidate_dict = dict()
            candidate_dict['counts'] = len(nodes)
            candidate_dict['matches'] = []
            has_time_info = False
            if self.task_duration_list:
                has_time_info = True
                candidate_dict['total_duration'] = round(sum(sum(duration) for duration in
                                                             self.task_duration_list[case_id]), 2)
            for node_index, refer_node in enumerate(nodes):
                match = []
                index = 0
                pass_name = ','.join(item.op_type for item in refer_node.keys())
                for query_node, host_node in refer_node.items():
                    fusion_pattern = query_node.op_pass

                    # pattern-level info is taken from the first node seen
                    if 'op_pass' not in candidate_dict:
                        candidate_dict['op_pass'] = fusion_pattern
                    if 'fusion_pattern' not in candidate_dict:
                        candidate_dict['fusion_pattern'] = pass_name
                    match_attr = dict()
                    match_attr['op_name'] = host_node.op_name
                    match_attr['dtype'] = query_node.op_type
                    if has_time_info:
                        match_attr['duration'] = round(self.task_duration_list[case_id][node_index][index], 2)
                    index += 1
                    match.append(match_attr)
                # trailing summary row ("-") with the match's total duration
                match_attr = dict()
                match_attr['op_name'] = "-"
                match_attr['dtype'] = "-"
                if has_time_info:
                    match_attr['duration'] = round(sum(self.task_duration_list[case_id][node_index]), 2)
                match.append(match_attr)
                candidate_dict['matches'].append(match)
            candidates_list.append(candidate_dict)
        html_render.render_template(key="computation",
                                    template_dir="templates",
                                    template_name="fusion.html",
                                    candidates=candidates_list)

    def make_record(self, result: OptimizeResult):
        """
        make record for what and how to optimize
        """
        if not self.candidates:
            return

        optimization_item = OptimizeItem(
            "fusion issue",
            f"Found {len(self.candidates)} fusion issues",
            ["Check fusion issues detail in att_advisor*.html"]
        )
        total_time = 0.0
        for candidate in self.task_duration_list:
            for duration in candidate:
                total_time += sum(duration)
        statistics_item = StatisticsItem(0,
                                         total_time,
                                         sum([len(candidate) for candidate in self.candidates])
                                         )
        result.add(OptimizeRecord(optimization_item, statistics_item))

        record_title = [
            "issue_id", "graph_name", "op_name", "fusion_structure", "fusion_pattern",
            "op_type", "input_shape", "input_format",
            "input_dtype", "output_shape", "output_format", "output_dtype"
        ]
        result.add_detail('fusion issues', headers=record_title)

        for case_id, nodes in enumerate(self.candidates):
            for _, refer_node in enumerate(nodes):
                pass_name = ','.join(item.op_type for item in refer_node.keys())
                for query_node, host_node in refer_node.items():
                    fusion_pattern = query_node.op_pass
                    detail = [
                        case_id,
                        host_node.graph_name,
                        host_node.op_name,
                        pass_name,
                        fusion_pattern,
                        query_node.op_type,
                        self.get_attr_shape(host_node, "input", "shape"),
                        self.get_attr_type(host_node, "input", "format"),
                        self.get_attr_type(host_node, "input", "dtype"),
                        self.get_attr_shape(host_node, "output", "shape"),
                        self.get_attr_type(host_node, "output", "format"),
                        self.get_attr_type(host_node, "output", "dtype"),
                    ]
                    result.add_detail('fusion issues', detail=detail)

    @staticmethod
    def get_attr_shape(node, type_name: str, attr_name: str) -> str:
        """Join per-tensor shapes with ';', dims inside one shape with ','."""
        attr_shape = []
        node_attrs = getattr(node, type_name, [])
        for attrs in node_attrs:
            attr = getattr(attrs, attr_name, [])
            attr_shape.append(",".join(attr))
        return ";".join(attr_shape)

    @staticmethod
    def get_attr_type(node, type_name: str, attr_name: str) -> str:
        """Join one scalar attribute of every input/output tensor with ';'."""
        attr_type = []
        node_attrs = getattr(node, type_name, [])
        for attr in node_attrs:
            attr_type.append(getattr(attr, attr_name, ""))
        return ";".join(attr_type)
diff --git a/profiler/advisor_review/analyzer/overall/__init__.py b/profiler/advisor_review/analyzer/overall/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/overall/overall_analyzer.py b/profiler/advisor_review/analyzer/overall/overall_analyzer.py
new file mode 100644
index 00000000000..916a396b3d0
--- /dev/null
+++ b/profiler/advisor_review/analyzer/overall/overall_analyzer.py
@@ -0,0 +1,45 @@
+import logging
+from typing import Dict, List
+
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.display.html.render import HTMLRender
+from profiler.advisor.result.result import OptimizeResult
+from profiler.compare_tools.compare_backend.utils.constant import Constant
+from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface
+
+logger = logging.getLogger()
+
+
class OverallSummaryAnalyzer(BaseAnalyzer):
    """
    Compare a profiling run against a benchmark run (or itself when no
    benchmark is given) and render the model time distribution table.
    """

    def __init__(self, profiling_path, benchmark_profiling_path=None, **kwargs):
        # NOTE(review): BaseAnalyzer.__init__ is not invoked, so none of the
        # base-class dataset machinery is initialized -- confirm this analyzer
        # is meant to bypass dataset parsing, otherwise add the super() call.
        self.benchmark_profiling_path = benchmark_profiling_path or profiling_path
        self.profiling_path = profiling_path
        self.html_render = HTMLRender()
        self.result = OptimizeResult()

    def optimize(self, **kwargs):
        """
        Run the overall comparison and render its time distribution.
        :return: the raw comparison result from ComparisonInterface
        """
        compare_result = ComparisonInterface(self.benchmark_profiling_path, self.profiling_path).compare(
            Constant.OVERALL_COMPARE)

        headers = compare_result.get('Model Profiling Time Distribution').get("headers", [])
        rows = compare_result.get('Model Profiling Time Distribution').get("rows", [])

        self.make_record()
        self.make_render(headers=headers, rows=rows)
        return compare_result

    def make_record(self):
        # no OptimizeRecord is produced for the overall summary
        pass

    def make_render(self, **kwargs):
        """Render the overall analysis table; skipped when headers or rows are empty."""
        headers = kwargs.get("headers")
        rows = kwargs.get("rows")

        if not headers or not rows:
            logger.info("Empty headers or rows, skip render overall analysis html")
            # fix(review): actually skip rendering, as the log message promises;
            # previously the template was rendered with empty data anyway
            return
        self.html_render.render_template(key="overall",
                                         template_dir="templates",
                                         template_name="overall_analysis.html",
                                         headers=headers,
                                         rows=rows)
diff --git a/profiler/advisor_review/analyzer/overall/overall_summary_analyzer.py b/profiler/advisor_review/analyzer/overall/overall_summary_analyzer.py
new file mode 100644
index 00000000000..c74ae051033
--- /dev/null
+++ b/profiler/advisor_review/analyzer/overall/overall_summary_analyzer.py
@@ -0,0 +1,262 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import copy
+
+import logging
+from typing import Dict, List
+
+from profiler.advisor.display.html.render import HTMLRender
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.compare_tools.compare_backend.utils.constant import Constant
+from profiler.advisor.common import constant as const
+from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface
+from profiler.advisor.utils.utils import get_file_path_from_directory, load_parameter
+
+
+class OverallSummaryAnalyzer(BaseAnalyzer):
+    """Analyze overall profiling time distribution and report bottlenecks.
+
+    Optionally compares against a benchmark collection supplied via the
+    ``base_collection_path`` kwarg; otherwise the collection is compared with
+    itself to obtain the per-category time breakdown.
+    """
+    OVERALL_SUMMARY_ANALYZER = "overall_summary_analysis"
+    advice_map = {
+        "Computing Time": "if you want more detailed advice please go to att_advisor_*.html",
+        "Uncovered Communication Time": "if you want more detailed advice please go to att_advisor_*.html",
+        "Free Time": "if you want more detailed advice please go to att_advisor_*.html"
+    }
+    time_name_map = {
+        "Computing Time": "computing",
+        "Uncovered Communication Time": "communication",
+        "Free Time": "free",
+        'Cube Time(Num)': 'Cube Time',
+        'Vector Time(Num)': 'Vector Time',
+        'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)',
+        'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)',
+        'Other Time': "Other Computing Time",
+        'SDMA Time(Num)': 'SDMA Time'
+    }
+    performance_time_dict = {
+        "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)',
+                           'Flash Attention Time(Backward)(Num)', 'Other Time'],
+        "Uncovered Communication Time(Wait Time)": [],
+        "Free Time": ['SDMA Time(Num)']
+    }
+
+    def __init__(self, collection_path: str, n_processes: int = 1, **kwargs):
+        profile_path = get_profile_path(collection_path)
+        super().__init__(profile_path, n_processes, **kwargs)
+        self.base_collection_path = kwargs.get("base_collection_path", "")
+        self._has_base_collection = False
+        self._is_minimal_profiling = False
+        self.cur_data = {}
+        self.cur_data_table = {}
+        self.cur_bottleneck = {}
+        self.cur_advices = ""
+        self._headers = []
+        self._base_data = []
+        self._comparison_data = []
+        self.html_render = HTMLRender()
+        self.result = OptimizeResult()
+        self.bottleneck_str = ""
+        self.bottleneck_table = {}
+
+    @staticmethod
+    def split_duration_and_num(time_value: str) -> tuple:
+        """Split a value like '0.229s(1756)' into (duration_in_seconds, num)."""
+        split_data = time_value.split("s")  # time value example: 0.229s(1756)
+        duration, num = 0.0, None
+        if len(split_data) >= 2:
+            try:
+                num = int(split_data[1].strip("()"))
+            except ValueError:
+                pass
+        if len(split_data) >= 1:
+            try:
+                duration = float(split_data[0])
+            except ValueError:
+                print(f"[WARNING] Invalid time value: {time_value}.")
+        return duration, num
+
+    @staticmethod
+    def calculate_ratio(dividend, divisor):
+        """Return dividend / divisor, or inf when the divisor is zero/falsy."""
+        if not divisor:
+            return float("inf")
+        return dividend / divisor
+
+    def path_check(self):
+        """Validate the paths; returns True when collection_path exists."""
+        if self.base_collection_path:
+            if os.path.exists(self.base_collection_path):
+                self._has_base_collection = True
+            else:
+                print(f"[WARNING] Invalid path which not exists: {self.base_collection_path}.")
+        return os.path.exists(self.collection_path)
+
+    def process(self):
+        """Run the overall comparison and collect per-category durations into cur_data."""
+        base_collection_path = self.base_collection_path if self._has_base_collection else self.collection_path
+        result_data = ComparisonInterface(base_collection_path, self.collection_path).compare(Constant.OVERALL_COMPARE)
+        for data in result_data.values():
+            self._headers = data.get("headers", [])
+            rows = data.get("rows", [])
+            if len(rows) == 2:
+                self._base_data = rows[0]
+                self._comparison_data = rows[1]
+        if not self._headers or not self._comparison_data:
+            return
+        self._is_minimal_profiling = 'E2E Time(Not minimal profiling)' not in self._headers
+        if self._has_base_collection:
+            self.cur_data["comparison_result"] = result_data
+        time_category_dict = {}
+        for time_category, time_list in self.performance_time_dict.items():
+            time_value = self.get_time_value(time_category, self._comparison_data)
+            if time_value == Constant.INVALID_VALUE:
+                continue
+            duration, _ = self.split_duration_and_num(time_value)
+            time_category = time_category.split("(")[0]
+            time_category_dict[time_category] = duration
+            self.get_sub_category_time(time_category, time_list, duration)
+        self.cur_data["overall_data"] = time_category_dict
+
+    def get_time_value(self, header_name: str, data_list: list):
+        """Return the value under header_name, or Constant.INVALID_VALUE when absent."""
+        try:
+            data_index = self._headers.index(header_name)
+        except ValueError:
+            return Constant.INVALID_VALUE
+        try:
+            time_value = data_list[data_index]
+        except IndexError:
+            return Constant.INVALID_VALUE
+        return time_value
+
+    def get_sub_category_time(self, category: str, time_list: list, total_duration: float):
+        """Collect duration/ratio/kernel-count columns for each sub-category of `category`."""
+        sub_time_dict = {}
+        for time_name in time_list:
+            time_value = self.get_time_value(time_name, self._comparison_data)
+            if time_value == Constant.INVALID_VALUE:
+                continue
+            sub_time_dict.setdefault(f"{category} Subtype", []).append(self.time_name_map.get(time_name, ""))
+            duration, num = self.split_duration_and_num(time_value)
+            sub_time_dict.setdefault("Duration(s)", []).append(duration)
+            sub_time_dict.setdefault("Duration Ratio", []).append(
+                "{:.2%}".format(self.calculate_ratio(duration, total_duration)))
+            sub_time_dict.setdefault("Kernel Number", []).append(num)
+        self.cur_data[self.time_name_map.get(category)] = sub_time_dict
+
+    def identify_bottleneck(self):
+        """Build bottleneck descriptions for overall and (optionally) comparison data."""
+        overall_data = self.cur_data.get("overall_data")
+        if not overall_data:
+            return
+        # Keep e2e_time numeric: it is used as a divisor below. The previous
+        # string-formatted value ('%.3f' % ...) made calculate_ratio raise
+        # TypeError on the minimal-profiling Free-Time branch.
+        e2e_time = sum(overall_data.values())
+        overall_bottleneck = f"The Model E2E Time is {e2e_time:.3f}s.\n"
+        comparison_bottleneck = ""
+        for time_type, time_value in overall_data.items():
+            # add subtype time bottleneck
+            self.cur_bottleneck[self.time_name_map.get(time_type)] = f"{time_type} is {time_value}s.\n"
+            # add overall bottleneck
+            overall_bottleneck += f" -- {time_type} is {time_value}s\n"
+            if time_type == "Free Time" and self._is_minimal_profiling \
+                    and self.calculate_ratio(time_value, e2e_time) > 0.1:
+                overall_bottleneck += "percentage of free time exceed the threshold 10%."
+            if not self._has_base_collection:
+                continue
+            # add comparison bottleneck
+            time_type_origin = "Uncovered Communication Time(Wait Time)" \
+                if time_type == "Uncovered Communication Time" else time_type
+            base_duration, _ = self.split_duration_and_num(self.get_time_value(time_type_origin, self._base_data))
+            if time_value > base_duration:
+                ratio = "{:.2%}".format(self.calculate_ratio(time_value - base_duration, base_duration))
+                comparison_bottleneck += f"{time_type} exceeds the benchmark by {ratio}\n"
+        self.cur_bottleneck["overall_data"] = overall_bottleneck
+        if comparison_bottleneck:
+            self.cur_bottleneck["comparison_result"] = comparison_bottleneck
+
+    def optimize(self, **kwargs):
+        """Entry point: run the full analysis pipeline and return the OptimizeResult."""
+        if self.path_check():
+            self.process()
+        self.identify_bottleneck()
+        self.format_bottleneck()
+        self.format_cur_data()
+        self.make_record()
+        self.make_render()
+        return self.result
+
+    def format_bottleneck(self):
+        """Flatten cur_bottleneck into a display string and a one-row table."""
+        result = ''
+        headers = []
+        data_list = []
+        data = []
+        for key, value in self.cur_bottleneck.items():
+            if not value:
+                continue
+            result += f'{key}: {value} \n'
+            headers.append(key)
+            data.append(value)
+        # Append the assembled row once after the loop: appending inside the
+        # loop stored the same row object repeatedly, duplicating it.
+        data_list.append(data)
+        self.bottleneck_str = result
+        self.bottleneck_table["headers"] = headers
+        self.bottleneck_table["data"] = data_list
+
+    def format_cur_data(self):
+        """Convert the collected data dicts into header/row tables for the report."""
+        if not self.cur_data:
+            return
+        for data_type, data in self.cur_data.items():
+            if not data:
+                continue
+            if data_type not in list(self.time_name_map.values()):
+                data_list = list(data.values())
+            else:
+                data_list = [','.join(map(str, value)) for value in data.values()]
+            headers = list(data.keys())
+            data_table = {"headers": headers, "data": [data_list]}
+            self.cur_data_table[data_type] = copy.deepcopy(data_table)
+
+    def make_record(self):
+        """
+        make record for what and how to optimize
+        """
+        if not self.bottleneck_str and not self.cur_advices:
+            return
+        optimization_item = OptimizeItem(
+            OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER,
+            self.bottleneck_str,
+            self.cur_advices
+        )
+        self.result.add(OptimizeRecord(optimization_item))
+
+        self.result.add_detail(const.BOTTLENECK, self.bottleneck_table["headers"], self.bottleneck_table["data"][0])
+        for data_type, data_dict in self.cur_data_table.items():
+            if data_dict:
+                self.result.add_detail(const.DATA + data_type, data_dict["headers"], data_dict["data"][0])
+
+    def make_render(self):
+        """Render the overall-summary section into the HTML report."""
+        if not self.bottleneck_str and not self.cur_advices:
+            return
+        result_for_html = {
+            "Description": self.bottleneck_str,
+            "suggestion": self.cur_advices,
+            "details": [self.bottleneck_table]
+        }
+
+        self.html_render.render_template(key="overall",
+                                         title=OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER,
+                                         template_dir="templates",
+                                         template_name="cluster_analysis.html",
+                                         cann_version=self.cann_version,
+                                         torch_version=self.torch_version,
+                                         result=result_for_html)
+
+def get_profile_path(collection_path):
+    """Walk collection_path and return the first directory containing a profiler_info* file, or ""."""
+    for root, _, file_names in os.walk(collection_path):
+        if any(name.startswith("profiler_info") for name in file_names):
+            return root
+    return ""
\ No newline at end of file
diff --git a/profiler/advisor_review/analyzer/schedule/__init__.py b/profiler/advisor_review/analyzer/schedule/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/schedule/dispatch/__init__.py b/profiler/advisor_review/analyzer/schedule/dispatch/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py b/profiler/advisor_review/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py
new file mode 100644
index 00000000000..0e62a3ff0c8
--- /dev/null
+++ b/profiler/advisor_review/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py
@@ -0,0 +1,107 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+
+
+from profiler.advisor.common import constant as const
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.display.html.render import HTMLRender
+
+logger = logging.getLogger()
+
+
+class OpDispatchAnalyzer(BaseAnalyzer):
+    """
+    operator dispatch optimizer
+
+    Detects excessive operator compile events in the timeline and suggests
+    disabling jit compile for dynamic-shape workloads.
+    """
+    # bug fix: the docstring above was previously placed after this assignment,
+    # making it a stray string expression instead of the class docstring.
+    dataset_cls_list = [TimelineEventDataset]
+
+    def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None:
+        super().__init__(collection_path, n_processes, **kwargs)
+        key = TimelineEventDataset.get_key()
+        self.dataset = self.get_first_data_by_key(self.dataset_list, key)
+        self.result = OptimizeResult()
+        self.html_render = HTMLRender()
+        self._op_compile = None
+        self._issues_record = []
+        self.optimization_item = []
+
+    def optimize(self, **kwargs):
+        """
+        optimize operator
+        :param data: input datasets
+        :return: result
+        """
+        self.get_op_compile_info(self.dataset)
+        self.make_record(self.result)
+        self.make_render(self.html_render)
+        return self.result
+
+    def get_op_compile_info(self, event_dataset: TimelineEventDataset):
+        """
+        :Param event_dataset: dataset of timeline event
+        """
+        if hasattr(event_dataset, "ops_compile"):
+            self._op_compile = getattr(event_dataset, "ops_compile")
+            # Only report when the compile count reaches the threshold.
+            if not self._op_compile or self._op_compile.total_count < const.MAX_OP_COMPILE_NUM:
+                return
+
+            self._issues_record.append(['operator dispatch',
+                                        const.OP_COMPILE_ID,
+                                        self._op_compile.total_count,
+                                        self._op_compile.total_time])
+        else:
+            logger.debug("Skip operator compile checker, because no op_compile attr find.")
+
+    def make_record(self, result: OptimizeResult):
+        """
+        make record for what and how to optimize
+        """
+        if not self._op_compile or len(self._issues_record) <= 0:
+            return
+        desc = f"Found {self._op_compile.total_count} operator compile issues."
+        suggestion = (f"Please use `torch_npu.npu.set_compile_mode(jit_compile=False)` to disable jit compile "
+                      f"in dynamic shape usage.")
+        self.optimization_item.append(OptimizeItem("Operator dispatch", desc, [suggestion]))
+        for optimization in self.optimization_item:
+            result.add(OptimizeRecord(optimization))
+        record_title = ["Issues", "op name", "counts", "total time"]
+        result.add_detail('operator dispatch', headers=record_title)
+        for op_info in self._issues_record:
+            result.add_detail('operator dispatch', detail=op_info)
+
+    def make_render(self, html_render):
+        """Render detected dispatch issues and suggestions into the HTML report."""
+        issues = []
+        optimizations = []
+        for optimization in self.optimization_item:
+            optimizations.append(dict(
+                description=optimization.description,
+                suggestion=optimization.suggestion[0]
+            ))
+        for record in self._issues_record:
+            issues.append(dict(issue=record[0],
+                               op_name=record[1],
+                               counts=record[2],
+                               total_time=record[3]))
+        html_render.render_template(key="schedule",
+                                    template_dir="templates",
+                                    template_name="operator_dispatch.html",
+                                    issues=issues,
+                                    optimizers=optimizations)
diff --git a/profiler/advisor_review/analyzer/schedule/free_event/__init__.py b/profiler/advisor_review/analyzer/schedule/free_event/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/schedule/fusion_ops/__init__.py b/profiler/advisor_review/analyzer/schedule/fusion_ops/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/profiler/advisor_review/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py b/profiler/advisor_review/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py
new file mode 100644
index 00000000000..c1eb24b8e1e
--- /dev/null
+++ b/profiler/advisor_review/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py
@@ -0,0 +1,271 @@
+import multiprocessing
+import logging
+import re
+
+from tqdm import tqdm
+
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.common import constant as const
+from profiler.advisor.common.analyzer_scopes import SupportedScopes
+from profiler.advisor.common.timeline.event import TimelineEvent
+from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.advisor.utils.utils import format_timeline_result
+from profiler.advisor.common.timeline.fusion_ops_db import init_timeline_ops_db
+
+logger = logging.getLogger()
+
+
+class TimelineFusionOpsAnalyzer(BaseAnalyzer):
+    """Scan the timeline for operator combinations replaceable by affinity torch_npu APIs.
+
+    Matches fusion-operator rules (plain or regex-style) against the event
+    stream, then collects the call stacks of matched operators for the report.
+    """
+    dataset_cls_list = [TimelineEventDataset]
+
+    def __init__(self, collection_path, n_processes: int = 1, **kwargs):
+        super().__init__(collection_path, n_processes, **kwargs)
+        # Manager().dict() so worker processes can share matches when n_processes > 1.
+        self._matched_op_index = {} if self.n_processes <= 1 else multiprocessing.Manager().dict()
+        self.matched_op_stacks = {}
+        self.empty_stacks = True
+        key = TimelineEventDataset.get_key()
+        self.timeline_event_dataset = self.get_first_data_by_key(self.dataset_list, key)
+
+    def optimize(self, **kwargs):
+        # Scan both aten and optimizer rule tables against the timeline.
+        for mode in [const.ATEN.lower(), const.OPTIMIZER.lower()]:
+
+            for op_combined, npu_apis in tqdm(getattr(init_timeline_ops_db(self.cann_version, self.torch_version),
+                                                      f"_{mode}_op_api_map").items(), leave=False, ncols=100,
+                                              desc="Scanning timeline for affinity apis"):
+                for npu_api in npu_apis.split("/"):
+                    self.find_fusion_ops(self.timeline_event_dataset, op_combined, npu_api, mode)
+
+        self.query_stack(self.timeline_event_dataset)
+
+        logger.info("Finish timeline analysis")
+        self.make_record()
+        self.make_render()
+        return self.result
+
+    def find_fusion_ops(self, event_dataset, ops: str, npu_api: str, mode: str):
+        """
+        :Param event_dataset: dataset of timeline event
+        :Param ops: operator combination with '-' as separator , e.g. permute-reshape
+        :Param npu_api: api of torch_npu, generally more efficient than torch api
+        :Param mode: aten or dequeue or optimizer
+        :Return: json of op_name and called times and detail stacks
+        """
+        op_rule_pattern, enable_regex = self._format_rule_to_pattern(ops)
+        if not enable_regex:
+            self._match_ops(event_dataset, op_rule_pattern, npu_api, mode)
+        else:
+            try:
+                self._match_ops_with_regex(event_dataset, op_rule_pattern, npu_api, mode)
+            except Exception as e:
+                logger.warning("Failed to find fusion operators with regex %s, reason is %s", ops, e)
+
+    def _match_ops(self, event_dataset, ops: str, npu_api: str, mode: str):
+        """ match operator based on fusion operators rule(without regex),
+        only strictly equals of op name list means matched
+        :Param event_dataset: dataset of timeline event
+        :Param ops: operator combination with '-' as separator , e.g. permute-reshape
+        :Param npu_api: api of torch_npu, generally more efficient than torch api
+        :Param mode: aten or dequeue or optimizer
+        """
+        op_list = ops.split(const.OP_SEP)
+
+        matched_op_index = set()
+        api_ops_matched = False
+
+        for index, event in enumerate(getattr(event_dataset, mode)):
+            # Anchor on the first op of the rule, then compare the full window.
+            if self._replace_op_name_prefix(event.name, mode) != op_list[0]:
+                continue
+            tmp_dequeue_event_names = [self._replace_op_name_prefix(event.name, mode) for event in
+                                       getattr(event_dataset, mode)[index: index + len(op_list)]]
+            if tmp_dequeue_event_names != op_list:
+                continue
+            api_ops_matched = True
+            matched_op_index.add(event.dataset_index)
+
+        if api_ops_matched:
+            self._matched_op_index[npu_api + f":{ops}"] = matched_op_index
+
+    def _match_ops_with_regex(self, event_dataset, op_rule_pattern: str, npu_api: str,
+                              mode: str):
+        """ match operator based on fusion operators rule(with regex),
+        using regex to support condition like 'a = torch.mul(xxx) if xxx else torch.add(xxx)'
+        :Param event_dataset: dataset of timeline event
+        :Param op_rule_pattern: fusion operators rule with regex definition , e.g. add-mul{0,10}, add-mul*
+        :Param npu_api: api of torch_npu, generally more efficient than torch api
+        :Param mode: aten or dequeue or optimizer
+        """
+        matched_op_index = set()
+        # Join all op names into one "-op1--op2-..." string so the rule regex
+        # can match across consecutive operators.
+        total_op_name = "".join([f"{const.OP_SEP}{self._replace_op_name_prefix(event.name, mode)}{const.OP_SEP}"
+                                 for event in
+                                 getattr(event_dataset, mode)])
+
+        matched_pattern_index_tuple = [(x.start(0), x.end(0)) for x in re.finditer(op_rule_pattern, total_op_name)]
+        # convert list of index tuple to a whole list: [(3, 25), ...] -> [3, 25, ...]
+        total_ops_split_points = [num for sublist in matched_pattern_index_tuple for num in sublist]
+
+        api_ops_matched = len(total_ops_split_points) != 0
+
+        op_index = []
+        if 0 not in total_ops_split_points:
+            total_ops_split_points = [0] + total_ops_split_points
+        if len(list(total_op_name)) not in total_ops_split_points:
+            total_ops_split_points.append(len(list(total_op_name)))
+
+        # convert total ops name like "-add-mul-xxx-div-" to small pieces like [["add", "mul"], [...], ["div"]]
+        # by the regex index and then calculate the real index for matched fusion operators in event dataset
+        for l, r in zip(total_ops_split_points, total_ops_split_points[1:]):
+            matched_op_flag = True if (l, r) in matched_pattern_index_tuple else False
+            matched_ops_list = total_op_name[l: r].strip(const.OP_SEP).split(const.OP_SEP + const.OP_SEP)
+            op_index.append([matched_op_flag, len(matched_ops_list)])
+        for i, _ in enumerate(op_index):
+            if i > 0:
+                # calculate cumsum for indexing matched operator
+                op_index[i][1] = op_index[i][1] + op_index[i - 1][1]
+        op_index = [[False, 0]] + op_index
+
+        for i, _ in enumerate(op_index):
+            if not op_index[i][0]:
+                continue
+            index = op_index[i - 1][1]
+            # NOTE(review): both the raw segment index and the event's
+            # dataset_index are added to matched_op_index — confirm this is
+            # intentional and not a leftover from a refactor.
+            matched_op_index.add(index)
+
+            if index > len(getattr(event_dataset, mode)) - 1:
+                continue
+            dataset_index = getattr(event_dataset, mode)[index].get("dataset_index")
+            matched_op_index.add(dataset_index)
+
+        if api_ops_matched:
+            self._matched_op_index[npu_api + f":{op_rule_pattern}"] = sorted(list(matched_op_index))
+
+    def make_record(self):
+        """
+        make record for what and how to optimize
+        """
+        if not self.matched_op_stacks:
+            return
+
+        desc = f"Found {len(format_timeline_result(self.matched_op_stacks))} apis to be replaced" \
+               f" based on the runtime env cann-{self.cann_version} and torch-{self.torch_version}"
+        suggestion = "Please replace training api according to sub table 'Affinity training api'"
+        if self.empty_stacks:
+            desc += ", but with no stack"
+            suggestion = const.TIMELINE_EMPTY_STACKS_PROMPT.format(
+                timeline_profiling_doc_url=const.TIMELINE_WITH_STACK_DOC_URL
+            )
+
+        optimization_item = OptimizeItem(
+            SupportedScopes.TIMELINE_FUSION_OPS,
+            desc,
+            [suggestion]
+        )
+
+        self.result.add(OptimizeRecord(optimization_item))
+
+        record_title = ["Affinity API", "Code stacks", "Stack called counts"]
+        self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, headers=record_title)
+
+        for api_name, stacks_info in format_timeline_result(self.matched_op_stacks).items():
+            if not stacks_info:
+                detail = [api_name, "null", "null"]
+                self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, detail=detail)
+            else:
+                for stack in stacks_info:
+                    detail = [api_name, *stack]
+                    self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, detail=detail)
+
+    def make_render(self):
+        # Render the affinity-api section of the HTML report.
+        format_result_for_html = format_timeline_result(dict(self.matched_op_stacks), dump_html=True)
+
+        self.html_render.render_template(key="schedule",
+                                         template_dir="templates",
+                                         template_name="affinity_api.html",
+                                         cann_version=self.cann_version,
+                                         torch_version=self.torch_version,
+                                         empty_stacks=self.empty_stacks,
+                                         with_stack_doc_url=const.TIMELINE_WITH_STACK_DOC_URL,
+                                         api_doc_url=const.TIMELINE_API_DOC_URL,
+                                         result=format_result_for_html)
+
+    def query_stack(self, event_dataset):
+        # Aggregate call-stack occurrence counts for every matched op rule.
+        if all([len(matched_index) == 0 for matched_index in self._matched_op_index.values()]):
+            return
+
+        op_stack_list = event_dataset.parse_data_with_generator(self._query_stack_by_matched_index)
+        for op_stack in op_stack_list:
+            for op_rule, stack in op_stack.items():
+                if op_rule not in self.matched_op_stacks:
+                    self.matched_op_stacks[op_rule] = {}
+                if stack == const.TIMELINE_FUSION_OPS_NO_STACK_FLAG:
+                    continue
+                if stack not in self.matched_op_stacks[op_rule]:
+                    self.matched_op_stacks[op_rule][stack] = 0
+                self.matched_op_stacks[op_rule][stack] += 1
+
+    def _query_stack_by_matched_index(self, index, event):
+        # Callback for parse_data_with_generator: maps a matched event index
+        # to {op_rule: stack} (or a no-stack flag when no stack was recorded).
+        stack_record = {}
+        event = TimelineEvent(event)
+
+        matched_op_rules = []
+        for op_rule, matched_index in self._matched_op_index.items():
+            if index not in matched_index:
+                continue
+
+            matched_op_rules.append(op_rule)
+            stack = event.args.get(const.CALL_STACKS)
+
+            if not stack:
+                logger.debug("Got empty '%s' for event %s", const.CALL_STACKS, event)
+                continue
+
+            if self.empty_stacks and stack:
+                self.empty_stacks = False
+
+            stack_record[op_rule] = stack
+
+        if matched_op_rules and not stack_record:
+            for op_rule in matched_op_rules:
+                stack_record[op_rule] = const.TIMELINE_FUSION_OPS_NO_STACK_FLAG
+
+        return stack_record
+
+    def _replace_op_name_prefix(self, event_name, mode):
+        # NOTE(review): optimize() passes const.ATEN.lower() as mode, but this
+        # compares against const.ATEN unchanged — confirm const.ATEN is already
+        # lowercase, otherwise the aten branch never matches.
+        if mode == const.DEQUEUE.lower():
+            op_name_prefix = f"{const.DEQUEUE}{const.DEQUEUE_SEP}"
+        elif mode == const.ATEN:
+            op_name_prefix = f"{const.ATEN}{const.ATEN_SEP}"
+        else:
+            op_name_prefix = f"{const.OPTIMIZER}.{const.OPTIMIZER_STEP}{const.OPTIMIZER_SEP}"
+
+        return event_name.replace(op_name_prefix, "")
+
+    def _format_rule_to_pattern(self, op_rule):
+        """
+        Args:
+            op_rule: like (mul){0,1}-(add|neg){0,2}-dropout-(softmax)*
+
+        Returns: op_pattern like (-mul-){0,1}(-add-|-neg-){0,2}(-dropout-)(-softmax-)*
+        """
+        enable_regex = False
+        if "(" not in op_rule and ")" not in op_rule:
+            # op_rule which requires fuzzy matching mush consist of "()"
+            return op_rule, enable_regex
+
+        enable_regex = True
+        op_pattern_list = op_rule.split(const.OP_SEP)
+        format_op_pattern = ""
+        for op_pattern in op_pattern_list:
+            matched_res = re.search(r'\((.*?)\)', op_pattern)
+
+            ops_index_range = (matched_res.start() + 1, matched_res.end() - 1) if matched_res else (
+                0, len(op_pattern))
+
+            op_names = op_pattern[ops_index_range[0]: ops_index_range[1]]
+            tmp_op_names_record = []
+            for op_name in op_names.split("|"):
+                # Wrap each alternative in OP_SEP so it matches whole op names only.
+                tmp_op_names_record.append(f"{const.OP_SEP}{op_name.strip(' ')}{const.OP_SEP}")
+            op_suffix = op_pattern[ops_index_range[1] + 1:]
+            op_names_format = f"({'|'.join(tmp_op_names_record)}){op_suffix}"
+
+            format_op_pattern += op_names_format
+        return format_op_pattern, enable_regex
diff --git a/profiler/advisor_review/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py b/profiler/advisor_review/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py
new file mode 100644
index 00000000000..f684a489211
--- /dev/null
+++ b/profiler/advisor_review/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py
@@ -0,0 +1,163 @@
+import logging
+from typing import List
+
+from profiler.advisor.common import constant as const
+from profiler.advisor.common.timeline.event import TimelineEvent
+from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.advisor.utils.utils import get_analyze_processes, ParallelJob
+
+logger = logging.getLogger()
+
+
+class OpStackFinder:
+    """Locate timeline call stacks for given operators (by name and/or task type).
+
+    Results are accumulated in _stack_record and reported via make_record.
+    """
+
+    def __init__(self):
+        self.n_processes = get_analyze_processes()
+        self._stack_record = []
+        self._task_id_record = {}
+        self.op_name = None
+        self.task_type = None
+        self.matched_index = set()
+
+    def get_api_stack_by_op(self, event_dataset: TimelineEventDataset, op_name: List[str] = None, task_type: str = None,
+                            disable_multiprocess=False):
+        """
+        :Param event_dataset: dataset of timeline event
+        :Param op_name: operator name, e.g. IndexPutV2
+        :Param task_type: operator task type, optionals are AI_CPU and AI_CORE
+        :Param disable_multiprocess: disable multiprocessing, avoid cost time of enable new process for light task
+        """
+        if not op_name:
+            op_name = []
+        if not isinstance(op_name, list):
+            op_name = [op_name]
+
+        self.op_name = ",".join(op_name)
+        self.task_type = task_type
+        op_name_list = event_dataset.task_op_names if not op_name else op_name
+
+        if self.n_processes <= 1 or disable_multiprocess:
+            self._query_stacks_multiprocess(event_dataset, op_name_list, task_type)
+        else:
+            # Shard op_name_list evenly across worker processes.
+            event_num_per_process = int(len(op_name_list) / self.n_processes) + 1
+            parallel_analyzer = ParallelJob(
+                self._query_stacks_multiprocess,
+                [[event_dataset, op_name_list[i:i + event_num_per_process], task_type]
+                 for i in range(0, len(op_name_list), event_num_per_process)],
+                job_name="Analyzing operator stacks from timeline"
+            )
+            parallel_analyzer.start(self.n_processes)
+        self.query_stack(event_dataset)
+
+    def make_record(self, result: OptimizeResult):
+        """
+        make record for what and how to optimize
+        """
+        if not self._stack_record:
+            return
+
+        desc = f"Found {len(self._stack_record)} called stacks for"
+        if self.op_name and self.task_type:
+            desc += f" operators with name '{self.op_name}' with task type '{self.task_type}'"
+        elif self.op_name and not self.task_type:
+            desc += f" operators with name '{self.op_name}'"
+        elif self.task_type and not self.op_name:
+            desc += f" operators with task type '{self.task_type}'"
+        else:
+            desc += " all operators"
+
+        suggestion = f"Please use command 'ma-advisor analyze profiling' to analyze operators"
+        optimization_item = OptimizeItem(
+            "Operator stacks",
+            desc,
+            [suggestion]
+        )
+        result.add(OptimizeRecord(optimization_item))
+
+        record_title = ["Task ID", "op name", "op type", "code stacks"]
+        result.add_detail('operator stacks', headers=record_title)
+
+        for op_info in self._stack_record:
+            result.add_detail('operator stacks', detail=op_info)
+
+    def _get_api_stack_by_op(self, event_dataset: TimelineEventDataset, op_name: str, task_type: str):
+        # For each framework-side op event matching name+task_type, resolve the
+        # index of the stack-bearing event it flows to and remember its task id.
+        for _, src_op_event in event_dataset.ops_with_task_type.items():
+
+            op_task_type = src_op_event.get(const.TASK_TYPE)
+            if not (src_op_event.name == op_name and op_task_type and op_task_type == task_type):
+                continue
+
+            # Several key formats are tried for compatibility across trace versions.
+            torch_to_npu_key = f"s-{src_op_event.tid}-{src_op_event.ts}"
+            torch_to_npu_event = event_dataset.torch_to_npu.get(torch_to_npu_key) or event_dataset.torch_to_npu.get(
+                f"s-{src_op_event.ts}") or event_dataset.torch_to_npu.get(f"s-{src_op_event.ts.replace('.', '')}")
+
+            acl_to_npu_event = src_op_event.ts in event_dataset.acl_to_npu
+
+            if not torch_to_npu_event and not acl_to_npu_event:
+                continue
+
+            # query stack by torch_to_npu first, due to each operator had acl_to_npu incoming flow in cann6.3
+            if torch_to_npu_event:
+                dst_op_index = self._query_index_by_torch_to_npu(event_dataset, torch_to_npu_event)
+            else:
+                dst_op_index = self._query_index_by_acl_to_npu(acl_to_npu_event)
+
+            if not dst_op_index:
+                continue
+
+            task_id = src_op_event.task_id
+            if not task_id:
+                continue
+            self.matched_index.add(dst_op_index)
+            if dst_op_index not in self._task_id_record:
+                self._task_id_record[dst_op_index] = []
+            self._task_id_record[dst_op_index].append([task_id, op_name, task_type])
+
+    def _query_index_by_torch_to_npu(self, event_dataset, torch_to_npu_event):
+        dst_op_event_key = torch_to_npu_event.ts
+        dst_op_event = event_dataset.ops_with_stack.get(dst_op_event_key)
+
+        # Sentinel code signals "no stack available" for this flow.
+        if not dst_op_event:
+            return const.TIMELINE_BACKWARD_NO_STACK_CODE
+
+        return dst_op_event.get("dataset_index")
+
+    def _query_index_by_acl_to_npu(self, acl_to_npu_event):
+        # Returns the sentinel code when the acl_to_npu flow exists, else None.
+        if acl_to_npu_event:
+            return const.TIMELINE_ACL_TO_NPU_NO_STACK_CODE
+
+    def _query_stacks_multiprocess(self, event_dataset, op_name_list, task_type):
+        # When no task_type is given, query both AI_CORE and AI_CPU per op.
+        for op_name in op_name_list:
+            if task_type is not None:
+                self._get_api_stack_by_op(event_dataset, op_name, task_type)
+            else:
+                self._get_api_stack_by_op(event_dataset, op_name, const.AI_CORE)
+                self._get_api_stack_by_op(event_dataset, op_name, const.AI_CPU)
+
+    def _format_stack_record(self):
+        # NOTE(review): appears unused within this class; stack_info here is a
+        # list of [task_id, op_name, task_type] records, so the unpacking yields
+        # nested lists — verify against callers before relying on this output.
+        stack_list = []
+        for task_id, stack_info in self._task_id_record.items():
+            stack_list.append([task_id, *stack_info])
+        return stack_list
+
+    def _query_stack_by_matched_index(self, index, event):
+        # Callback for parse_data_with_generator: emit stack rows for matched events.
+        if index not in self.matched_index:
+            return None
+        event = TimelineEvent(event)
+        stack = event.args.get(const.CALL_STACKS)
+        stack = stack if stack else const.NO_STACK_REASON_MAP.get(const.TIMELINE_BACKWARD_NO_STACK_CODE)
+        for matched_op_info in self._task_id_record.get(index, []):
+            self._stack_record.append([*matched_op_info, stack])
+
+        # NOTE(review): this runs once per matched index, so acl_to_npu records
+        # may be appended multiple times when several indices match — confirm.
+        for matched_op_info in self._task_id_record.get(const.TIMELINE_ACL_TO_NPU_NO_STACK_CODE, []):
+            self._stack_record.append([*matched_op_info,
+                                       const.NO_STACK_REASON_MAP.get(const.TIMELINE_ACL_TO_NPU_NO_STACK_CODE)])
+        return None
+
+    def query_stack(self, event_dataset: TimelineEventDataset):
+        # Drive the per-event callback over the whole dataset (results collected
+        # via side effects on self._stack_record).
+        if not event_dataset.dataset_len:
+            return
+        _ = event_dataset.parse_data_with_generator(self._query_stack_by_matched_index)
diff --git a/profiler/advisor_review/cluster_perf_analysis.ipynb b/profiler/advisor_review/cluster_perf_analysis.ipynb
new file mode 100644
index 00000000000..7ee0b24e854
--- /dev/null
+++ b/profiler/advisor_review/cluster_perf_analysis.ipynb
@@ -0,0 +1,1042 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "initial_id",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-21T13:31:25.022339600Z",
+ "start_time": "2023-11-21T13:31:25.016155200Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.append(\"../..\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "c552da9d-36f9-43d3-ae1f-c54f78d3ff2d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from profiler.advisor.interface.interface import Interface\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "from prettytable import PrettyTable, ALL\n",
+ "from textwrap import fill"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "57d17a21205c3c5e",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "source": [
+ "# 集群调优分析\n",
+ "## 1. 集群分析的数据准备\n",
+ "首先我们当前支持PyTorch多卡大模型的集群分析,您需要输入集群分析的profiling_path路径,例如: \n",
+ "--{profiling_path} \n",
+ " -- xxxx_ascend_pt \n",
+ " -- xxxx_ascend_pt \n",
+ " -- xxxx_ascend_pt \n",
+ " ...... \n",
+ " -- xxxx_ascend_pt \n",
+ "里面每张卡的profiling文件都是ascend_pt结尾的文件。 \n",
+ "\n",
+ "## 2. 集群分析解决的问题 \n",
+ "当前的功能主要有四项: \n",
+ "1). 识别多卡间的计算慢卡(根据计算时间等推断) \n",
+ "2). 识别多卡间的通信慢现象(根据通信链路的带宽判断) \n",
+ "3). 对多卡间的计算算子进行统计展示(识别不同卡的算子差异) \n",
+ "4). 展示集群流水并行图(根据时间轴展示多卡间的计算和通信时间) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "36b7a24cc7ca5da2",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-21T12:53:38.379699800Z",
+ "start_time": "2023-11-21T12:53:38.363755900Z"
+ },
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# EDIT THE PROFILING DATA PATH\n",
+ "cluster_path = r\"YOUR PROFILING PATH\"\n",
+ "interface = Interface(profiling_path=cluster_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cf832ac2e0dfa30f",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "source": [
+ "## 1) 识别慢卡"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "40aac93278dd6e34",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-21T12:53:41.815599700Z",
+ "start_time": "2023-11-21T12:53:41.783393700Z"
+ },
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[INFO]Cluster has been analyzed because of the existence of cluster analysis output directory.\n",
+ "[INFO]Skip Cluster analyze backend.\n"
+ ]
+ }
+ ],
+ "source": [
+ "slow_rank_result = interface.get_result(\"cluster\", \"slow_rank\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "0e943b2a-37a6-4db6-9e70-235d397f1d39",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
rank_id | \n", + "compute | \n", + "communication | \n", + "free | \n", + "
---|---|---|---|
0 | \n", + "28976239.07999987 | \n", + "7586795.419999811 | \n", + "6836641.679994211 | \n", + "
1 | \n", + "29012279.100000102 | \n", + "6984613.220000025 | \n", + "7388343.859991224 | \n", + "
2 | \n", + "29019115.32300051 | \n", + "7489956.633000028 | \n", + "6881360.253991371 | \n", + "
3 | \n", + "29027089.560000077 | \n", + "7963312.239999794 | \n", + "6389981.899993688 | \n", + "
4 | \n", + "29044786.93699965 | \n", + "6533618.639000017 | \n", + "7780517.1539908135 | \n", + "
5 | \n", + "29178186.259999853 | \n", + "7925184.420000028 | \n", + "6286867.999995028 | \n", + "
6 | \n", + "29025331.189999904 | \n", + "6386639.90799992 | \n", + "7941798.704992032 | \n", + "
7 | \n", + "29056803.304999545 | \n", + "7234444.826000024 | \n", + "7094608.035991492 | \n", + "
8 | \n", + "31383314.980000228 | \n", + "3973806.6169999996 | \n", + "8017981.379989724 | \n", + "
9 | \n", + "31360536.36200019 | \n", + "4757458.825000002 | \n", + "7277062.386991671 | \n", + "
10 | \n", + "31381891.800000463 | \n", + "5276870.359999998 | \n", + "6731073.659992552 | \n", + "
11 | \n", + "31387777.38000033 | \n", + "4727362.3000000045 | \n", + "7297578.339992355 | \n", + "
12 | \n", + "31374132.74499977 | \n", + "5164443.388000004 | \n", + "6829798.933991944 | \n", + "
13 | \n", + "31377800.178999804 | \n", + "4360616.283000001 | \n", + "7624691.509991412 | \n", + "
14 | \n", + "31374658.360000316 | \n", + "4457099.620000001 | \n", + "7542724.319990785 | \n", + "
15 | \n", + "31387255.527000006 | \n", + "5000860.905 | \n", + "6975264.115991174 | \n", + "
problem | \n", + "description | \n", + "
---|---|
slow_rank_analysis | \n", + "Computing has some issues in the cluster, because the max difference of Computing time has reached 2411.538ms. Communication has some issues in the cluster, because the max difference of Communication time has reached 3989.506ms. | \n",
+ "
rank_id | \n", + "RDMA bandwidth(GB/s) | \n", + "RDMA size(mb) | \n", + "RDMA time(ms) | \n", + "SDMA bandwidth(GB/s) | \n", + "SDMA size(mb) | \n", + "SDMA time(ms) | \n", + "
---|---|---|---|---|---|---|
0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "9.7668 | \n", + "42507.3469439998 | \n", + "4352.225880000002 | \n", + "
1 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "10.1653 | \n", + "42507.346775999795 | \n", + "4181.611080000001 | \n", + "
2 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "10.471 | \n", + "42507.346775999795 | \n", + "4059.527798999999 | \n", + "
3 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "9.9691 | \n", + "42507.346775999795 | \n", + "4263.9230400000015 | \n", + "
4 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "9.1469 | \n", + "42507.346775999795 | \n", + "4647.202435000001 | \n", + "
5 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "9.4663 | \n", + "42507.346775999795 | \n", + "4490.373999999999 | \n", + "
6 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "9.5692 | \n", + "42507.346775999795 | \n", + "4442.106745000001 | \n", + "
7 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "9.8444 | \n", + "42507.346775999795 | \n", + "4317.931616999999 | \n", + "
8 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.895 | \n", + "42507.389952 | \n", + "2249.662369 | \n", + "
9 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.9112 | \n", + "42507.39080800006 | \n", + "2247.7420159999997 | \n", + "
10 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.7713 | \n", + "42507.39080800006 | \n", + "2264.48576 | \n", + "
11 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.8389 | \n", + "42507.39080800006 | \n", + "2256.3606000000004 | \n", + "
12 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.7687 | \n", + "42507.39080800006 | \n", + "2264.8021099999996 | \n", + "
13 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.9717 | \n", + "42507.39080800006 | \n", + "2240.5713950000004 | \n", + "
14 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.9226 | \n", + "42507.39080800006 | \n", + "2246.381839999999 | \n", + "
15 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.8346 | \n", + "42507.39080800006 | \n", + "2256.8781 | \n", + "
problem | \n", + "description | \n", + "
---|---|
slow_rank_analysis | \n", + "Computing has some issues in the cluster, because the max difference of Computing time has reached 2411.538ms. Communication has some issues in the cluster, because the max difference of Communication time has reached 3989.506ms. | \n",
+ "
slow_link_analysis | \n", + "SDMA bandwidth(GB/s): The average is 14.332, while the maximum is 18.972GB/s and the minimum is 9.147GB/s. the difference is 9.825GB/s. | \n",
+ "
\n", + " | rank id | \n", + "Name | \n", + "Input Shapes | \n", + "Input Data Types | \n", + "Output Shapes | \n", + "Duration(us)_mean | \n", + "Duration(us)_var | \n", + "Duration(us)_max | \n", + "Duration(us)_min | \n", + "Duration(us)_count | \n", + "Duration(us)_sum | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0 | \n", + "Add100 | \n", + "\"4096,10880;4096,10880\" | \n", + "FLOAT;FLOAT | \n", + "\"4096,10880\" | \n", + "478.210918 | \n", + "237.729252 | \n", + "721.420 | \n", + "449.80 | \n", + "1024 | \n", + "489687.980 | \n", + "
1 | \n", + "0 | \n", + "Add102 | \n", + "\"21760;21760\" | \n", + "FLOAT;FLOAT | \n", + "\"21760\" | \n", + "4.390391 | \n", + "0.011915 | \n", + "4.820 | \n", + "3.98 | \n", + "1024 | \n", + "4495.760 | \n", + "
2 | \n", + "0 | \n", + "Add106 | \n", + "\"21760,4096;21760,4096\" | \n", + "FLOAT;FLOAT | \n", + "\"21760,4096\" | \n", + "933.504395 | \n", + "462.979321 | \n", + "1257.140 | \n", + "927.38 | \n", + "1024 | \n", + "955908.500 | \n", + "
3 | \n", + "0 | \n", + "Add111 | \n", + "\"4096,4096;4096,4096\" | \n", + "FLOAT;FLOAT | \n", + "\"4096,4096\" | \n", + "91.267363 | \n", + "2.158275 | \n", + "97.120 | \n", + "85.12 | \n", + "1024 | \n", + "93457.780 | \n", + "
4 | \n", + "0 | \n", + "Add118 | \n", + "\"12288,4096;12288,4096\" | \n", + "FLOAT;FLOAT | \n", + "\"12288,4096\" | \n", + "526.312012 | \n", + "1462.617511 | \n", + "787.780 | \n", + "424.24 | \n", + "1024 | \n", + "538943.500 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
2513 | \n", + "15 | \n", + "trans_Cast_12 | \n", + "\"4096,1,1,128\" | \n", + "FLOAT | \n", + "\"4096,1,1,128\" | \n", + "8.486495 | \n", + "0.060174 | \n", + "9.820 | \n", + "8.20 | \n", + "2048 | \n", + "17380.342 | \n", + "
2514 | \n", + "15 | \n", + "trans_Cast_13 | \n", + "\"4096,1,1,128\" | \n", + "FLOAT | \n", + "\"4096,1,1,128\" | \n", + "10.534564 | \n", + "0.166380 | \n", + "12.900 | \n", + "9.48 | \n", + "2048 | \n", + "21574.787 | \n", + "
2515 | \n", + "15 | \n", + "trans_Cast_14 | \n", + "\"4096,1,1,128\" | \n", + "FLOAT | \n", + "\"4096,1,1,128\" | \n", + "9.784551 | \n", + "0.295368 | \n", + "13.021 | \n", + "8.56 | \n", + "2048 | \n", + "20038.761 | \n", + "
2516 | \n", + "15 | \n", + "trans_Cast_15 | \n", + "\"4096,1,1,128\" | \n", + "DT_BF16 | \n", + "\"4096,1,1,128\" | \n", + "8.342211 | \n", + "0.120471 | \n", + "10.220 | \n", + "7.86 | \n", + "2048 | \n", + "17084.848 | \n", + "
2517 | \n", + "15 | \n", + "trans_Cast_16 | \n", + "\"4096,1,1,128\" | \n", + "DT_BF16 | \n", + "\"4096,1,1,128\" | \n", + "9.507589 | \n", + "0.117111 | \n", + "11.681 | \n", + "9.18 | \n", + "2048 | \n", + "19471.543 | \n", + "
2518 rows × 11 columns
\n", + "problem | \n", + "description | \n", + "suggestion | \n", + "problem count | \n", + "total_time(us) | \n", + "time ratio | \n", + "income(us) | \n", + "income ratio | \n", + "
---|---|---|---|---|---|---|---|
block dim | \n", + "some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi sor\\operator_tuning_file_20240613153259.cfg' | \n",
+ " 101 | \n", + "814.0199999999999 | \n", + "1.0 | \n", + "\n", + " | \n", + " |
op_name | \n", + "op_type | \n", + "task_type | \n", + "task_duration | \n", + "income | \n", + "block_dim | \n", + "mix_block_dim | \n", + "input_shapes | \n", + "input_data_types | \n", + "input_formats | \n", + "output_shapes | \n", + "output_data_types | \n", + "output_formats | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm- LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35 | \n",
+ " Square | \n", + "AI_VECTOR_CORE | \n", + "42.76 | \n", + "0 | \n", + "16 | \n", + "0 | \n", + ""128,128" | \n", + "FLOAT | \n", + "NCHW | \n", + ""128,1" | \n", + "FLOAT | \n", + "NCHW | \n", + "
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm- LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78 | \n",
+ " Square | \n", + "AI_VECTOR_CORE | \n", + "42.24 | \n", + "0 | \n", + "16 | \n", + "0 | \n", + ""128,128" | \n", + "FLOAT | \n", + "NCHW | \n", + ""128,1" | \n", + "FLOAT | \n", + "NCHW | \n", + "
Default/lm_head-Linear/MatMul-op213 | \n", + "MatMulV2 | \n", + "AI_CORE | \n", + "39.02 | \n", + "0 | \n", + "20 | \n", + "0 | \n", + ""128,128;128,32000" | \n", + "FLOAT16;FLOAT16 | \n", + "FORMAT_ND;FORMAT_ND | \n", + ""128,32000" | \n", + "FLOAT | \n", + "FORMAT_ND | \n", + "
problem | \n", + "description | \n", + "suggestion | \n", + "problem count | \n", + "total_time(us) | \n", + "time ratio | \n", + "income(us) | \n", + "income ratio | \n", + "
---|---|---|---|---|---|---|---|
block dim | \n", + "some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi sor\\operator_tuning_file_20240613153259.cfg' | \n",
+ " 101 | \n", + "814.0199999999999 | \n", + "1.0 | \n", + "\n", + " | \n", + " |
operator no bound | \n", + "There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task duration operators need to be tuned are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi sor\\operator_tuning_file_20240613153259.cfg' | \n",
+ " 95 | \n", + "814.0199999999999 | \n", + "0.7985 | \n", + "\n", + " | \n", + " |
op_name | \n", + "op_type | \n", + "task_type | \n", + "task_duration | \n", + "vec_ratio | \n", + "mac_ratio | \n", + "scalar_ratio | \n", + "mte1_ratio | \n", + "mte2_ratio | \n", + "mte3_ratio | \n", + "block_dim | \n", + "input_shapes | \n", + "input_data_types | \n", + "input_formats | \n", + "output_shapes | \n", + "output_data_types | \n", + "output_formats | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm- LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35 | \n",
+ " Square | \n", + "AI_VECTOR_CORE | \n", + "42.76 | \n", + "0.4654 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0.0056 | \n", + "16 | \n", + ""128,128" | \n", + "FLOAT | \n", + "NCHW | \n", + ""128,1" | \n", + "FLOAT | \n", + "NCHW | \n", + "
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm- LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78 | \n",
+ " Square | \n", + "AI_VECTOR_CORE | \n", + "42.24 | \n", + "0.466 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0.0062 | \n", + "16 | \n", + ""128,128" | \n", + "FLOAT | \n", + "NCHW | \n", + ""128,1" | \n", + "FLOAT | \n", + "NCHW | \n", + "
Default/lm_head-Linear/MatMul-op213 | \n", + "MatMulV2 | \n", + "AI_CORE | \n", + "39.02 | \n", + "0 | \n", + "0.1105 | \n", + "0.0119 | \n", + "0.0857 | \n", + "0.4284 | \n", + "0 | \n", + "20 | \n", + ""128,128;128,32000" | \n", + "FLOAT16;FLOAT16 | \n", + "FORMAT_ND;FORMAT_ND | \n", + ""128,32000" | \n", + "FLOAT | \n", + "FORMAT_ND | \n", + "
problem | \n", + "description | \n", + "suggestion | \n", + "problem count | \n", + "total_time(us) | \n", + "time ratio | \n", + "income(us) | \n", + "income ratio | \n", + "
---|---|---|---|---|---|---|---|
block dim | \n", + "some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi sor\\operator_tuning_file_20240613153259.cfg' | \n",
+ " 101 | \n", + "814.0199999999999 | \n", + "1.0 | \n", + "\n", + " | \n", + " |
operator no bound | \n", + "There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task duration operators need to be tuned are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi sor\\operator_tuning_file_20240613153259.cfg' | \n",
+ " 95 | \n", + "814.0199999999999 | \n", + "0.7985 | \n", + "\n", + " | \n", + " |
AICPU operator | \n", + "Some operators and task duration exceed 20 us, such as : Cast | \n", + "1. Modify code to avoid aicpu operator | \n", + "39 | \n", + "686568.860000001 | \n", + "0.0189 | \n", + "\n", + " | \n", + " |
op_name | \n", + "op_type | \n", + "task_duration | \n", + "input_shapes | \n", + "input_data_types | \n", + "input_formats | \n", + "output_shapes | \n", + "output_data_types | \n", + "output_formats | \n", + "stack_info | \n", + "
---|---|---|---|---|---|---|---|---|---|
trans_Cast_5 | \n", + "Cast | \n", + "493.64 | \n", + """ | \n", + "INT32 | \n", + "FORMAT_ND | \n", + """ | \n", + "UINT64 | \n", + "FORMAT_ND | \n", + "/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279): dropout; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/dropout.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/language_model.py(236): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/language_model.py(425): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/module.py(184): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/distributed.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; ../../pretrain_gpt.py(88): forward_step; /profiling_auto_GPT3/megatron/schedules.py(118): forward_step; /home/s30040711/Megatron- LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(96): forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419): train_step; /profiling_auto_GPT3/megatron/training.py(837): train; /profiling_auto_GPT3/megatron/training.py(152): pretrain; ../../pretrain_gpt.py(122): <module> | \n",
+ "
trans_Cast_5 | \n", + "Cast | \n", + "413.4 | \n", + """ | \n", + "INT32 | \n", + "FORMAT_ND | \n", + """ | \n", + "UINT64 | \n", + "FORMAT_ND | \n", + "/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279): dropout; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/dropout.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/language_model.py(236): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/language_model.py(425): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/module.py(184): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/distributed.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; ../../pretrain_gpt.py(88): forward_step; /profiling_auto_GPT3/megatron/schedules.py(118): forward_step; /home/s30040711/Megatron- LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(109): forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419): train_step; /profiling_auto_GPT3/megatron/training.py(837): train; /profiling_auto_GPT3/megatron/training.py(152): pretrain; ../../pretrain_gpt.py(122): <module> | \n",
+ "
{{ header }} | + {% endfor %} +|
{{ element|round(2) }} | + {% else %} +{{ element }} | + {% endif %} + {% endfor %} +
{{ header }} | + {% endfor %} +|
{{ element|round(2) }} | + {% else %} +{{ element }} | + {% endif %} + {% endfor %} +
Structure | +Counts | +Elapsed Time(us) | +
---|---|---|
{{ node.fusion_pattern|safe }} | +{{ node.counts|safe }} | +{{ node.total_duration|safe }} | +
OP Name | +OP Type | +Elapsed Time(us) | +
---|---|---|
{{ node.op_name|safe }} | +{{ node.dtype|safe }} | +{{ node.duration|safe }} | +
Description | +Suggestion | +Elapsed Time(us) | +Time Ratio | +
---|---|---|---|
{{ format_result.record.optimization_item.description|safe }} | +{{ format_result.suggestion|safe }} | +{{ format_result.task_duration|safe }} | +{{ format_result.record.statistics_item.task_duration_ratio|safe }} | +
Operator Type | +Counts | +Elapsed Time(us) | +
---|---|---|
{{ op_info.summary.op_type|safe }} | +{{ op_info.summary.counts|safe }} | +{{ op_info.summary.total_duration|safe }} | +
+ Suggestion {{ loop.index|safe }}: {{suggestion|safe}} +
+ {% endfor %} +Suggestion 1: Modify code to avoid AICPU operator
+ {% endif %} + + {{ info.op_info_list[0].stack_info|safe }} +Description | +Suggestion | +Elapsed Time(us) | +Time Ratio | +
---|---|---|---|
{{ format_result.record.optimization_item.description|safe }} | +{{ format_result.suggestion|safe }} | +{{ format_result.task_duration|safe }} | +{{ format_result.record.statistics_item.task_duration_ratio|safe }} | +
Operator Type | +Counts | +Elapsed Time(us) | +
---|---|---|
{{ op_info.summary.op_type|safe }} | +{{ op_info.summary.counts|safe }} | +{{ op_info.summary.total_duration|safe }} | +
Description | +Suggestion | +
---|---|
{{ optimizer.description |safe }} | +{{ optimizer.suggestion|safe }} | +
Issue | +Counts | +Elapsed Time(us) | +
---|---|---|
{{ issue.op_name |safe }} | +{{ issue.counts |safe }} | +{{ issue.total_time |safe }} | +
Description | +Suggestion | +
---|---|
{{ format_result.record.optimization_item.description|safe }} | +{{ format_result.suggestion|safe }} | +
Description | +Suggestion | +Elapsed Time(us) | +Time Ratio | +
---|---|---|---|
{{ format_result.record.optimization_item.description|safe }} | +{{ format_result.suggestion|safe }} | +{{ format_result.task_duration|safe }} | +{{ format_result.record.statistics_item.task_duration_ratio|safe }} | +
Operator Type | +Counts | +Elapsed Time(us) | +
---|---|---|
{{ op_info.summary.op_type|safe }} | +{{ op_info.summary.counts|safe }} | +{{ op_info.summary.total_duration|safe }} | +
{{ header }} | + {% endfor %} +
{{ element }} | + {% endfor %} +
problem | \n", + "description | \n", + "suggestion | \n", + "
---|---|---|
timeline_fusion_ops | \n", + "Found 2 apis to be replaced based on the runtime env cann-8.0.0 and torch-2.1.0 | \n", + "1. Please replace training api according to sub table 'Affinity training api' | \n", + "
Affinity API | \n", + "Code stacks | \n", + "Stack called counts | \n", + "
---|---|---|
optimizer.clip_grad_norm_fused_ | \n", + "/home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- packages/torch/nn/utils/clip_grad.py(49): clip_grad_norm_; /home/ma- user/work/algorithms/doc_cls/Bert.py(205): train_epoch; /home/ma- user/work/algorithms/doc_cls/Bert.py(252): <module> | \n",
+ " 2 | \n", + "
torch_npu.optim.NpuFusedAdamW | \n", + "/home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- packages/torch_npu/npu/profiler.py(675): __enter__; /home/ma- user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- packages/torch_npu/npu/profiler.py(719): wrapper; /home/ma- user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- packages/torch/optim/lr_scheduler.py(65): wrapper; /home/ma- user/work/algorithms/doc_cls/Bert.py(219): train_epoch; /home/ma- user/work/algorithms/doc_cls/Bert.py(252): <module> | \n",
+ " 2 | \n", + "