diff --git a/profiler/advisor_review/README.md b/profiler/advisor_review/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..283aa2943881262ffbefaeb7025cf301c17b18fa
--- /dev/null
+++ b/profiler/advisor_review/README.md
@@ -0,0 +1,80 @@
+# advisor
+
+msprof-analyze的advisor功能是将Ascend PyTorch Profiler或者msprof采集的PyTorch场景性能数据进行分析,并输出性能调优建议(当前暂不支持对db格式文件分析)。
+
+## 工具使用(命令行方式)
+
+1. 参见《[性能工具](../README.md)》完成工具安装。建议安装最新版本。
+
+2. 执行分析。
+
+ - 总体性能瓶颈
+
+ ```bash
+ msprof-analyze advisor all -d [待分析性能数据文件所在路径] -bp [基准性能数据文件所在路径]
+ ```
+
+ - 计算瓶颈
+
+ ```bash
+ msprof-analyze advisor computation -d [待分析性能数据文件所在路径]
+ ```
+
+ - 调度瓶颈
+
+ ```bash
+ msprof-analyze advisor schedule -d [待分析性能数据文件所在路径]
+ ```
+
+
+ -d(必选):待分析性能数据文件所在路径。
+
+ -bp(可选):基准性能数据文件所在路径。
+
+ 单卡场景需要指定到性能数据文件`*_ascend_pt`目录;多卡或集群场景需要指定到`*_ascend_pt`目录的父目录层级。
+
+3. 查看结果。
+
+ 分析结果打屏展示并生成html和csv文件。
+
+## 工具使用(Jupyter Notebook方式)
+
+Jupyter Notebook使用方式如下:
+
+下列以Windows环境下执行为例介绍。
+
+1. 在环境下安装Jupyter Notebook工具。
+
+ ```bash
+ pip install jupyter notebook
+ ```
+
+ Jupyter Notebook工具的具体安装和使用指导请至Jupyter Notebook工具官网查找。
+
+2. 在环境下安装ATT工具。
+
+ ```
+ git clone https://gitee.com/ascend/att.git
+ ```
+
+ 安装环境下保存Ascend PyTorch Profiler采集的性能数据。
+
+3. 进入att\profiler\advisor目录执行如下命令启动Jupyter Notebook工具。
+
+ ```bash
+ jupyter notebook
+ ```
+
+ 执行成功则自动启动浏览器读取att\profiler\advisor目录,如下示例:
+
+ 
+
+ 若在Linux环境下则回显打印URL地址,即是打开Jupyter Notebook工具页面的地址,需要复制URL,并使用浏览器访问(若为远端服务器则需要将域名“**localhost**”替换为远端服务器的IP),进入Jupyter Notebook工具页面。
+
+4. 每个.ipynb文件为一项性能数据分析任务,选择需要的.ipynb打开,并在*_path参数下拷贝保存Ascend PyTorch Profiler采集的性能数据的路径。如下示例:
+
+ 
+
+5. 单击运行按钮执行性能数据分析。
+
+ 分析结果详细内容会在.ipynb页面下展示。
diff --git a/profiler/advisor_review/__init__.py b/profiler/advisor_review/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e79018ed05c6d1cdeb56feaa6182f048e3c8e06f
--- /dev/null
+++ b/profiler/advisor_review/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from profiler.advisor.interface.interface import Interface
\ No newline at end of file
diff --git a/profiler/advisor_review/advisor_backend/__init__.py b/profiler/advisor_review/advisor_backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0e9f748f4b10347a874f60cec1fa9f6e5285a5e
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
diff --git a/profiler/advisor_review/advisor_backend/advice_base.py b/profiler/advisor_review/advisor_backend/advice_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..35939bcea9c87fb09f2113bd19f77ea18ba54e34
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_base.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from abc import abstractmethod
+
+
+class AdviceBase:
+ DATA = "data"
+ BOTTLENECK = "bottleneck"
+ ADVICE = "advice"
+
+ def __init__(self, collection_path: str):
+ self.collection_path = os.path.realpath(collection_path)
+ self.bottelneck = ''
+ self.output_format_data = {
+ self.DATA: [],
+ self.BOTTLENECK: '',
+ self.ADVICE: ''
+ }
+
+ @abstractmethod
+ def path_check(self):
+ """
+ check whether input path is valid
+ """
+
+ @abstractmethod
+ def run(self):
+ """
+ analyze profiling data and advice
+ """
+
+ @abstractmethod
+ def output(self):
+ """
+ output relevant data
+ """
\ No newline at end of file
diff --git a/profiler/advisor_review/advisor_backend/advice_factory/__init__.py b/profiler/advisor_review/advisor_backend/advice_factory/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0e9f748f4b10347a874f60cec1fa9f6e5285a5e
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_factory/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
diff --git a/profiler/advisor_review/advisor_backend/advice_factory/advice_factory.py b/profiler/advisor_review/advisor_backend/advice_factory/advice_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..639f4800cfe8c9acdc8fe7ea5f65a43fc8892b2b
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_factory/advice_factory.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+from common_func.path_manager import PathManager
+
+
+class AdviceFactory:
+ def __init__(self, collection_path: str):
+ self.collection_path = os.path.realpath(collection_path)
+
+ @staticmethod
+ def run_advice(self, advice: str, kwargs: dict):
+ """
+ run advice to produce data
+ """
+
+ def produce_advice(self, advice: str, kwargs: dict):
+ """
+ produce data for input mode and advice
+ """
+ self.path_check()
+ self.advice_check(advice)
+ return self.run_advice(advice, kwargs)
+
+ def path_check(self):
+ """
+ check whether input path is valid
+ """
+ PathManager.input_path_common_check(self.collection_path)
+
+ def advice_check(self, advice: str):
+ """
+ check whether input advice is valid
+ """
+ if advice not in self.ADVICE_LIB.keys():
+ msg = '[ERROR]Input advice is illegal.'
+ raise RuntimeError(msg)
diff --git a/profiler/advisor_review/advisor_backend/advice_factory/cluster_advice_factory.py b/profiler/advisor_review/advisor_backend/advice_factory/cluster_advice_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bb93f46704eb13fef14d070f891e350446829ea
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_factory/cluster_advice_factory.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from advice_factory.advice_factory import AdviceFactory
+from cluster_advice.slow_link_advice import SlowLinkAdvice
+from cluster_advice.slow_rank_advice import SlowRankAdvice
+from cluster_advice.cluster_pipeline_advice import ClusterPipelineAdvice
+from cluster_advice.kernel_cluster_advice import KernelClusterAdvice
+from common_func_advisor.constant import Constant
+
+
+class ClusterAdviceFactory(AdviceFactory):
+ ADVICE_LIB = {
+ Constant.SLOW_RANK: SlowRankAdvice,
+ Constant.SLOW_LINK: SlowLinkAdvice,
+ Constant.PIPELINE: ClusterPipelineAdvice,
+ Constant.KERNEL: KernelClusterAdvice
+ }
+
+ def __init__(self, collection_path: str):
+ super().__init__(collection_path)
+
+ def run_advice(self, advice: str, kwargs: dict):
+ """
+ run advice to produce data
+ """
+ return self.ADVICE_LIB.get(advice)(self.collection_path, kwargs).run()
diff --git a/profiler/advisor_review/advisor_backend/advice_factory/compute_advice_factory.py b/profiler/advisor_review/advisor_backend/advice_factory/compute_advice_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..336bef7dd8553eb82586d52260443a7d01e84ab0
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_factory/compute_advice_factory.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from common_func_advisor.constant import Constant
+from advice_factory.advice_factory import AdviceFactory
+from compute_advice.npu_fused_advice import NpuFusedAdvice
+from compute_advice.npu_slow_advice import NpuSlowAdvice
+
+
+class ComputeAdviceFactory(AdviceFactory):
+ ADVICE_LIB = {
+ Constant.NPU_FUSED: NpuFusedAdvice,
+ Constant.NPU_SLOW: NpuSlowAdvice,
+ }
+
+ def __init__(self, collection_path: str):
+ super().__init__(collection_path)
+
+ def run_advice(self, advice: str, kwargs: dict):
+ """
+ run advice to produce data
+ """
+ return self.ADVICE_LIB.get(advice)(self.collection_path).run()
diff --git a/profiler/advisor_review/advisor_backend/advice_factory/overall_advice_factory.py b/profiler/advisor_review/advisor_backend/advice_factory/overall_advice_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..baf80cc200f4c3cd1057b7fc28e750948a450cf1
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_factory/overall_advice_factory.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from advice_factory.advice_factory import AdviceFactory
+from common_func_advisor.constant import Constant
+from overall_advice.overall_summary_advice import OverallSummaryAdvice
+
+
+class OverallAdviceFactory(AdviceFactory):
+ ADVICE_LIB = {
+ Constant.SUMMARY: OverallSummaryAdvice
+ }
+
+ def __init__(self, collection_path: str):
+ super().__init__(collection_path)
+
+ def run_advice(self, advice: str, kwargs: dict):
+ """
+ run advice to produce data
+ """
+ return self.ADVICE_LIB.get(advice)(self.collection_path, kwargs).run()
diff --git a/profiler/advisor_review/advisor_backend/advice_factory/timeline_advice_factory.py b/profiler/advisor_review/advisor_backend/advice_factory/timeline_advice_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..44b352e95a7bb1007bc7373193603c2a0b9d8b6c
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/advice_factory/timeline_advice_factory.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from advice_factory.advice_factory import AdviceFactory
+from common_func_advisor.constant import Constant
+from timeline_advice.optimizer_advice import OptimizerAdvice
+from timeline_advice.op_schedule_advice import OpScheduleAdvice
+
+
+class TimelineAdviceFactory(AdviceFactory):
+ ADVICE_LIB = {
+ Constant.OPTIM: OptimizerAdvice,
+ Constant.OP_SCHE: OpScheduleAdvice,
+ }
+
+ def __init__(self, collection_path: str):
+ super().__init__(collection_path)
+
+ def run_advice(self, advice: str, kwargs: dict):
+ """
+ run advice to produce data
+ """
+ return self.ADVICE_LIB.get(advice)(self.collection_path).run()
diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/__init__.py b/profiler/advisor_review/advisor_backend/cluster_advice/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/cluster_advice/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/cluster_advice_base.py b/profiler/advisor_review/advisor_backend/cluster_advice/cluster_advice_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9be4675963a9cd48da3b4cd91ee646f8e82468b
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/cluster_advice/cluster_advice_base.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from abc import abstractmethod
+from common_func.constant import Constant
+from advice_base import AdviceBase
+from cluster_analysis import Interface
+
+
+class ClusterAdviceBase(AdviceBase):
+ def __init__(self, collection_path: str):
+ super().__init__(collection_path)
+
+ @staticmethod
+ def compute_max_gap_ratio(data: list, mean: float):
+ if mean == 0:
+ return 0
+ else:
+ return (max(data) - min(data)) / mean
+
+ def path_check(self):
+ """
+ check whether input path is valid
+ """
+ for file in os.listdir(self.collection_path):
+ if file == 'cluster_analysis_output':
+ print("[INFO]Cluster has been analyzed "
+ "because of the existence of cluster analysis output directory.")
+ print("[INFO]Skip Cluster analyze backend.")
+ return
+ print("[INFO] cluster analysis is in the process, please wait...")
+ self.cluster_analyze()
+
+ def cluster_analyze(self):
+ parameter = {
+ Constant.COLLECTION_PATH: self.collection_path,
+ Constant.ANALYSIS_MODE: "all"
+ }
+ try:
+ Interface(parameter).run()
+ except Exception as e:
+ raise ValueError(f"Cluster analyze backend failed:{e}") from e
+
+ @abstractmethod
+ def run(self):
+ """
+ analyze profiling data and advice
+ """
+
+ @abstractmethod
+ def output(self):
+ """
+ output relevant data
+ """
\ No newline at end of file
diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/cluster_pipeline_advice.py b/profiler/advisor_review/advisor_backend/cluster_advice/cluster_pipeline_advice.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f8846f1d99e9bc81636df32d04148df99d12920
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/cluster_advice/cluster_pipeline_advice.py
@@ -0,0 +1,437 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import multiprocessing
+from typing import Dict
+from typing import Optional
+from typing import Deque
+from typing import List
+from typing import Tuple
+from collections import defaultdict
+from collections import deque
+from decimal import Decimal
+from dataclasses import dataclass
+
+from common_func.file_manager import FileManager
+from common_func_advisor.constant import Constant
+from common_func_advisor.trace_view_preprocessor import FineTraceViewData
+from common_func_advisor.trace_view_preprocessor import TraceViewPreProcessor
+from cluster_advice.cluster_advice_base import ClusterAdviceBase
+from cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor
+
+
+@dataclass
+class PipelineTimeSlice:
+ start: str = ""
+ end: str = ""
+ slice_type: str = ""
+ bp_timeslice: list = None
+
+ def __post_init__(self):
+ self.bp_timeslice = self.bp_timeslice or []
+
+
+class PipelineTraceViewer:
+ STAGE_COLOR = "good"
+ BUBBLE_COLOR = "generic_work"
+ FP_COLOR = "good"
+ BP_COLOR = "bad"
+ PIPLINE_VIEW = "Pipeline View"
+ STAGE = "Stage"
+ BUBBLE = "Bubble"
+ FP = "FP"
+ BP = "BP"
+
+ COLORS = {
+ STAGE: STAGE_COLOR,
+ BUBBLE: BUBBLE_COLOR,
+ FP: FP_COLOR,
+ BP: BP_COLOR
+ }
+
+ def _gen_trace_pair(self, name: str, start_ts: str, end_ts: str, pid: str, tid: str) -> Dict:
+ data = {
+ Constant.OP_NAME: name,
+ Constant.CNAME: self.COLORS.get(name, self.BUBBLE),
+ Constant.PH: Constant.PH_X,
+ Constant.PID: pid,
+ Constant.OP_TID: tid,
+ Constant.TS: start_ts,
+ Constant.DUR: str(Decimal(end_ts) - Decimal(start_ts))
+ }
+
+ return data
+
+ def gen_stage_bubble_trace_data(self, rank_id: int, timeslice_list: List[PipelineTimeSlice]) -> List[Dict]:
+ """
+ generate stage bubble trace json data
+ """
+ rank_str = f'Rank {rank_id}'
+ trace_data = []
+
+ for timeslice in timeslice_list:
+ data = self._gen_trace_pair(timeslice.slice_type, timeslice.start,
+ timeslice.end, self.PIPLINE_VIEW, rank_str)
+ trace_data.append(data)
+
+ return trace_data
+
+ def gen_fp_bp_trace_data(self, rank_id: int, timeslice_list: List[PipelineTimeSlice]) -> List[Dict]:
+ """
+ generate fp bp trace json data
+ """
+ rank_str = f'Rank {rank_id}'
+ trace_data = []
+
+ for timeslice in timeslice_list:
+ if timeslice.slice_type == self.BUBBLE:
+ data = self._gen_trace_pair(timeslice.slice_type, timeslice.start,
+ timeslice.end, self.PIPLINE_VIEW, rank_str)
+ trace_data.append(data)
+ else:
+ last_end = timeslice.start
+ for bp_bound in timeslice.bp_timeslice:
+ data = self._gen_trace_pair(self.FP, last_end,
+ bp_bound[0], self.PIPLINE_VIEW, rank_str)
+ trace_data.append(data)
+ last_end = bp_bound[1]
+
+ data = self._gen_trace_pair(self.BP, bp_bound[0],
+ bp_bound[1], self.PIPLINE_VIEW, rank_str)
+ trace_data.append(data)
+
+ last_data = self._gen_trace_pair(self.FP, last_end,
+ timeslice.end, self.PIPLINE_VIEW, rank_str)
+ trace_data.append(last_data)
+
+ return trace_data
+
+
+class ClusterPipelineAdvice(ClusterAdviceBase):
+ BUBBLE = "Bubble"
+ STAGE = "Stage"
+ PIPELINE_VIEW = "Pipeline View"
+ SAVE_JSON = "pipeline_view.json"
+
+ def __init__(self, collection_path: str, kwargs: dict):
+ super().__init__(collection_path)
+ self.rank_ids = list(set(kwargs.get("rank_ids", [])))
+ self.worker_num = kwargs.get("worker_num", int(multiprocessing.cpu_count() / 2))
+ self.rank_prof_dirs = {}
+ self.cur_data = []
+ self.cur_bottleneck = {}
+ self.cur_advices = ""
+
+ def run(self) -> dict:
+ """
+ Unified entrance interface
+ """
+ self.rank_prof_dirs = self.get_rank_prof_dirs(self.rank_ids)
+ if not self.rank_prof_dirs:
+ print("[ERROR] No rank profiling data found, please check the rank ids or dir path.")
+ return {}
+
+ self.process()
+ self.output()
+ self.identify_bottleneck()
+ return self.output_format_data
+
+ def process(self) -> None:
+ """
+ process all rank profiling data by using multi-process
+ """
+ start_time = time.time()
+ print(f"[INFO] Start to process {len(self.rank_prof_dirs)} rank profiling data with {self.worker_num} workers.")
+ with multiprocessing.Pool(self.worker_num) as pool:
+ results = pool.map(self.work, self.rank_prof_dirs.items())
+
+ for (rank_id, _), (res, show_fp_bp) in zip(self.rank_prof_dirs.items(), results):
+ if show_fp_bp:
+ self.cur_data += PipelineTraceViewer().gen_fp_bp_trace_data(rank_id, res)
+ else:
+ self.cur_data += PipelineTraceViewer().gen_stage_bubble_trace_data(rank_id, res)
+ print(f"[INFO] Pipline view data process finished, cost {time.time() - start_time:.2f}s.")
+
+ @staticmethod
+ def _align_trace_bound(results: List) -> None:
+ """
+ align all rank trace bound for better visualization
+ """
+ start_list, end_list = [], []
+ for res in results:
+ start_list.append(res[0].start)
+ end_list.append(res[-1].end)
+
+ # update all rank trace bound
+ for res in results:
+ res[0].start = min(start_list)
+ res[-1].end = max(end_list)
+
+ def work(self, kv: Tuple[int, str]) -> Tuple[List[PipelineTimeSlice], bool]:
+ """
+ single process worker function
+ """
+ show_fp_bp = False
+ rank_id, rank_prof_dir = kv
+ print(f"[INFO] [Rank {rank_id}] Start to process rank profiling data.")
+ json_path = os.path.join(rank_prof_dir, Constant.ASCEND_PROFILER_OUTPUT, Constant.TRACE_VIEW_JSON)
+ fine_data = self.load_trace_view_data(json_path)
+ if not fine_data.hcom_ops or not fine_data.hcom_tids:
+ print(f"[ERROR] [Rank {rank_id}] No hcom send recv ops found, make sure the trace view data is pipeline "
+ f"parallel sense.")
+ return [], show_fp_bp
+
+ timeslice_list = self.get_pipeline_timeslice(fine_data.hcom_ops, fine_data.hcom_tids, fine_data.min_ts,
+ fine_data.max_ts)
+ if not fine_data.fp_ops or not fine_data.bp_ops:
+ print(f"[INFO] [Rank {rank_id}] No frameWork data in trace view, only show stage and bubble.")
+ elif len(fine_data.hcom_tids) > 1:
+ print(f"[WARN] [Rank {rank_id}] More than one hcom tid found, only show stage and bubble.")
+ else:
+ print(f"[INFO] [Rank {rank_id}] Found frameWork data in trace view, show fp bp and bubble.")
+ bp_ops = self.get_fp_bp_bound_ops(fine_data)
+ self.update_stage_fp_bp(timeslice_list, bp_ops)
+ show_fp_bp = True
+ print(f"[INFO] [Rank {rank_id}] Rank profiling data process finished.")
+
+ return timeslice_list, show_fp_bp
+
+ def identify_bottleneck(self) -> None:
+ pass
+
+ def output(self) -> None:
+ """
+ output result
+ """
+ self.cur_data.append(
+ {
+ Constant.OP_NAME: Constant.PROCESS_NAME,
+ Constant.PH: Constant.PH_META,
+ Constant.PID: self.PIPELINE_VIEW,
+ Constant.OP_TID: self.PIPELINE_VIEW,
+ Constant.ARGS: {
+ Constant.OP_NAME: self.PIPELINE_VIEW
+ }
+ }
+ )
+ self.output_format_data[self.DATA] = self.cur_data
+ self.output_format_data[self.BOTTLENECK] = self.cur_bottleneck
+ self.output_format_data[self.ADVICE] = self.cur_advices
+
+ def get_rank_prof_dirs(self, rank_ids: list) -> Dict[int, str]:
+ """
+ get rank profiling directories by rank ids
+ """
+ rank_prof_dirs = defaultdict(str)
+ prof_dirs = []
+ for prof_dir in os.listdir(self.collection_path):
+ if prof_dir.endswith(Constant.PT_PROF_SUFFIX):
+ prof_dirs.append(os.path.join(self.collection_path, prof_dir))
+
+ data_map = PytorchDataPreprocessor(prof_dirs).get_data_map()
+ for rank_id in rank_ids:
+ if rank_id in data_map:
+ rank_prof_dirs[rank_id] = data_map[rank_id]
+ else:
+ print(f'[Warning] Rank {rank_id} not found in {self.collection_path}')
+
+ return rank_prof_dirs
+
+ @staticmethod
+ def load_trace_view_data(json_path) -> Optional[FineTraceViewData]:
+ """
+ load trace view data from json file and preprocess
+ """
+ raw_data = FileManager.read_json_file(json_path)
+ return TraceViewPreProcessor().process(raw_data)
+
+ @staticmethod
+ def double_queue_pop(fp_que: Deque[dict], bp_que: Deque[dict]) -> Tuple[list, list]:
+ """
+ double queue (fp and bp que) pop alternating algorithm implementation
+ """
+ res_fp_ops, res_bp_ops = [], []
+ pop_fp = fp_que[0][Constant.TS] < bp_que[0][Constant.TS]
+ fp_start_op, fp_end_op = fp_que[0], fp_que[0]
+ bp_start_op, bp_end_op = bp_que[0], bp_que[0]
+
+ def update_bound_op(que: Deque[dict], start_op: dict, end_op: dict) -> Tuple[dict, dict]:
+ """
+ update fp and bp bound op
+ """
+ op = que.popleft()
+ op_s = Decimal(op[Constant.TS])
+ op_e = op_s + Decimal(op[Constant.DUR])
+
+ start_op = op if op_s < Decimal(start_op[Constant.TS]) else start_op
+ end_op = op if op_e > Decimal(end_op[Constant.TS]) + Decimal(end_op[Constant.DUR]) else end_op
+
+ return start_op, end_op
+
+ while fp_que and bp_que:
+ if pop_fp:
+ if len(fp_que) > 1 and bp_que and fp_que[1][Constant.TS] > bp_que[0][Constant.TS]:
+ pop_fp = False # pop bp que
+ if len(fp_que) == 1:
+ pop_fp = False # pop bp que
+
+ fp_start_op, fp_end_op = update_bound_op(fp_que, fp_start_op, fp_end_op)
+
+ # time to pop bp que, need to record fp ops and update bp start op
+ if not pop_fp:
+ res_fp_ops.append((fp_start_op, fp_end_op))
+ if fp_que:
+ bp_start_op, bp_end_op = bp_que[0], bp_que[0]
+ else:
+ if len(bp_que) > 1 and fp_que and bp_que[1][Constant.TS] > fp_que[0][Constant.TS]:
+ pop_fp = True # pop fp que
+ if len(bp_que) == 1:
+ pop_fp = True # pop fp que
+
+ bp_start_op, bp_end_op = update_bound_op(bp_que, bp_start_op, bp_end_op)
+
+ # time to pop fp que, need to record bp ops and update fp start op
+ if pop_fp:
+ res_bp_ops.append((bp_start_op, bp_end_op))
+ if bp_que:
+ fp_start_op, fp_end_op = fp_que[0], fp_que[0]
+
+ if fp_que:
+ fp_start_op, fp_end_op = fp_que[0], fp_que[0]
+ while fp_que:
+ fp_start_op, fp_end_op = update_bound_op(fp_que, fp_start_op, fp_end_op)
+ res_fp_ops.append((fp_start_op, fp_end_op))
+
+ if bp_que:
+ bp_start_op, bp_end_op = bp_que[0], bp_que[0]
+ while bp_que:
+ bp_start_op, bp_end_op = update_bound_op(bp_que, bp_start_op, bp_end_op)
+ res_bp_ops.append((bp_start_op, bp_end_op))
+
+ return res_fp_ops, res_bp_ops
+
+ @staticmethod
+ def update_ops_time(ops_list: List[List[dict]], torch_to_npu_links: List[dict],
+ npu_ops_ts_dur: dict) -> List[List[dict]]:
+ """
+ update fp and bp bound ops time at device by using torch_to_npu_links
+ """
+ ops_que = deque(ops_list)
+ torch_to_npu_que = deque(torch_to_npu_links)
+ res = []
+ link_stack = []
+ while ops_que and torch_to_npu_que:
+ link = torch_to_npu_que.popleft()
+ link_s = Decimal(link[Constant.TS])
+
+ # bound op at framework level
+ cpu_op_l, cpu_op_r = ops_que[0][0], ops_que[0][1]
+ cpu_op_s = Decimal(cpu_op_l[Constant.TS])
+ cpu_op_e = Decimal(cpu_op_r[Constant.TS]) + Decimal(cpu_op_r[Constant.DUR])
+
+ if cpu_op_s < link_s < cpu_op_e:
+ link_stack.append(link)
+ if link_s > cpu_op_e or \
+ (link_stack and not torch_to_npu_que):
+ min_link = link_stack[0]
+ max_link = link_stack[-1]
+
+ min_link_s = str(min_link[Constant.ID])
+ max_link_s = str(max_link[Constant.ID])
+ # for compatibility with old data (ts is float type)
+ if isinstance(min_link[Constant.ID], float):
+ cpu_op_l["npu_op_ts"] = min_link_s
+ cpu_op_r["npu_op_ts"] = max_link_s
+ else:
+ cpu_op_l["npu_op_ts"] = f"{min_link_s[:-3]}.{min_link_s[-3:]}"
+ cpu_op_r["npu_op_ts"] = f"{max_link_s[:-3]}.{max_link_s[-3:]}"
+ cpu_op_l["npu_op_dur"] = npu_ops_ts_dur.get(cpu_op_l["npu_op_ts"], 0)
+ cpu_op_r["npu_op_dur"] = npu_ops_ts_dur.get(cpu_op_r["npu_op_ts"], 0)
+
+ res.append([cpu_op_l, cpu_op_r])
+ ops_que.popleft()
+ link_stack.clear()
+
+ return res
+
+ def get_fp_bp_bound_ops(self, fine_data: FineTraceViewData) -> List[List[dict]]:
+ """
+ get fp and bp bound ops by using double queue alternating pop algorithm and
+ update fp and bp bound ops time at device by using torch_to_npu_links
+ """
+ fp_que = deque(fine_data.fp_ops)
+ bp_que = deque(fine_data.bp_ops)
+
+ # get fp and bp bound ops
+ _, res_bp_ops = self.double_queue_pop(fp_que, bp_que)
+
+ # according to torch_to_npu_links, split fp and bp timeslice
+ bp_ops = self.update_ops_time(res_bp_ops, fine_data.torch_to_npu_links, fine_data.npu_ops_ts_dur)
+ return bp_ops
+
+ def get_pipeline_timeslice(self, hcom_ops: list, hcom_tids: list,
+ min_ts: str, max_ts: str) -> List[PipelineTimeSlice]:
+ """
+ get pipeline timeslice by using hcom ops
+ """
+ timeslice_list = []
+ last_op_end = None
+ if len(hcom_tids) > 1:
+ print("[WARN] More than one hcom tid found, default to show minimal tid pipeline view.")
+
+ for op in hcom_ops:
+ if op[Constant.OP_TID] == min(hcom_tids):
+ # gap between two hcom ops
+ if last_op_end:
+ timeslice_list.append(PipelineTimeSlice(str(last_op_end), op[Constant.TS], self.STAGE))
+ # hcom op
+ last_op_end = Decimal(op[Constant.TS]) + Decimal(op[Constant.DUR])
+ timeslice_list.append(PipelineTimeSlice(op[Constant.TS], str(last_op_end), self.BUBBLE))
+
+ # add start STAGE and end STAGE
+ timeslice_list.insert(0, PipelineTimeSlice(min_ts, timeslice_list[0].start, self.STAGE))
+ timeslice_list.insert(len(timeslice_list), PipelineTimeSlice(timeslice_list[-1].end, max_ts, self.STAGE))
+ return timeslice_list
+
+ def update_stage_fp_bp(self, timeslice_list: List[PipelineTimeSlice],
+ bp_ops: List[List[dict]]) -> None:
+ """
+ update stage fp and bp time
+ """
+ pipeline_que = deque(timeslice_list)
+ bp_bound_que = deque(bp_ops)
+
+ while pipeline_que and bp_bound_que:
+ while pipeline_que[0].slice_type != self.STAGE:
+ pipeline_que.popleft()
+ if not pipeline_que:
+ return None
+
+ bp_bound_data = bp_bound_que[0]
+ bp_bound_s = Decimal(bp_bound_data[0]['npu_op_ts'])
+ bp_bound_e = Decimal(bp_bound_data[1]['npu_op_ts']) + Decimal(bp_bound_data[1]['npu_op_dur'])
+
+ pipeline_s = Decimal(pipeline_que[0].start)
+ pipeline_e = Decimal(pipeline_que[0].end)
+
+ if pipeline_s <= bp_bound_s and bp_bound_e <= pipeline_e:
+ pipeline_que[0].bp_timeslice.append((str(bp_bound_s), str(bp_bound_e)))
+ bp_bound_que.popleft()
+ elif bp_bound_s > pipeline_e:
+ pipeline_que.popleft()
+ else:
+ bp_bound_que.popleft()
diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/kernel_cluster_advice.py b/profiler/advisor_review/advisor_backend/cluster_advice/kernel_cluster_advice.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fa83c765f5fe1f4ac20dcc62895fe0450e338ce
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/cluster_advice/kernel_cluster_advice.py
@@ -0,0 +1,62 @@
+import os
+import pandas as pd
+from common_func.path_manager import PathManager
+from common_func.constant import Constant
+from common_func_advisor.constant import Constant as AdvisorConstant
+from cluster_advice.cluster_advice_base import ClusterAdviceBase
+from cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor
+
+
class KernelClusterAdvice(ClusterAdviceBase):
    """Aggregate per-rank kernel_details.csv files into one statistics view.

    For every *_ascend_pt profile under collection_path, kernel rows are
    concatenated (tagged with their rank id) and grouped by op identity to
    compute duration statistics (mean/var/max/min/count/sum).
    """
    COLUMNS_TO_GROUP = ["Name", "Input Shapes", "Input Data Types", "Output Shapes"]
    COLUMNS_TO_CAL = ["Duration(us)"]
    CAL_FUN = ['mean', 'var', 'max', 'min', 'count', 'sum']

    def __init__(self, collection_path: str, kwargs: dict = None):
        super().__init__(collection_path)
        self.all_kernel_data = pd.DataFrame()

    def run(self):
        """Load all ranks' kernel data and return the aggregated statistics frame."""
        self.load_kernel_details_data()
        return self.calculate_data()

    def load_kernel_details_data(self):
        """Collect each rank's kernel_details.csv into self.all_kernel_data.

        Raises RuntimeError when no profile directory exists or a csv is
        missing the required columns.
        """
        prof_dirs = self.get_prof_dirs(self.collection_path)
        if not prof_dirs:
            msg = "[ERROR] There is no profile in this collection path, terminate analysis."
            raise RuntimeError(msg)

        data_map = PytorchDataPreprocessor(prof_dirs).get_data_map()
        self.all_kernel_data = pd.DataFrame()
        columns_to_keep = self.COLUMNS_TO_GROUP + self.COLUMNS_TO_CAL
        for rank_id, profiling_dir_path in data_map.items():
            kernel_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.KERNEL_DETAILS_CSV)
            # bugfix: os.path.join always yields a non-empty string, so the former
            # truthiness check never detected a missing file; test existence instead
            if not os.path.exists(kernel_file):
                print(f"[WARN] {kernel_file} doesn't exist, skip this rank.")
                continue
            # validate csv file size/permissions before reading
            PathManager.check_path_readable(kernel_file)
            df_temp = pd.read_csv(kernel_file)
            if any(column not in df_temp.columns for column in columns_to_keep):
                msg = "[ERROR] Kernel details.csv has wrong data columns, terminate analysis."
                raise RuntimeError(msg)
            # copy() so the inserted column does not write into a slice view
            # (avoids pandas SettingWithCopyWarning)
            df = df_temp[columns_to_keep].copy()
            df.insert(loc=0, column='rank id', value=rank_id)
            # append this rank's rows to the final frame
            self.all_kernel_data = pd.concat([self.all_kernel_data, df], ignore_index=True)

    def calculate_data(self):
        """Group the concatenated kernel rows and compute duration statistics."""
        calculate_dict = {column: self.CAL_FUN for column in self.COLUMNS_TO_CAL}
        group_col = ["rank id"] + self.COLUMNS_TO_GROUP
        view_data = self.all_kernel_data.groupby(group_col).agg(calculate_dict).reset_index()
        # flatten the (column, stat) MultiIndex into 'column_stat' names
        view_data.columns = [''.join(col) if col[1] == "" else '_'.join(col) for col in view_data.columns]
        return view_data

    def get_prof_dirs(self, collection_path):
        """Return the *_ascend_pt profile directories directly under collection_path."""
        prof_dirs = []
        for prof_dir in os.listdir(collection_path):
            if prof_dir.endswith(AdvisorConstant.PT_PROF_SUFFIX):
                prof_dirs.append(os.path.join(collection_path, prof_dir))

        return prof_dirs
\ No newline at end of file
diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/slow_link_advice.py b/profiler/advisor_review/advisor_backend/cluster_advice/slow_link_advice.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8a625242f3939602cbb7b8391cd8062e21fe01b
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/cluster_advice/slow_link_advice.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from collections import defaultdict
+from common_func_advisor.constant import Constant
+from common_func.file_manager import FileManager
+from cluster_advice.cluster_advice_base import ClusterAdviceBase
+
+
class SlowLinkAdvice(ClusterAdviceBase):
    """Detect slow links by computing per-rank RDMA/SDMA bandwidth and
    reporting the spread between the fastest and slowest rank."""
    RDMA_TIME_MS = "RDMA time(ms)"
    RDMA_SIZE_MB = "RDMA size(mb)"
    SDMA_TIME_MS = "SDMA time(ms)"
    SDMA_SIZE_MB = "SDMA size(mb)"
    RDMA_BANDWIDTH = "RDMA bandwidth(GB/s)"
    SDMA_BANDWIDTH = "SDMA bandwidth(GB/s)"
    COMMUNICATION_BANDWIDTH_INFO = "Communication Bandwidth Info"
    TRANSIT_TIME = "Transit Time(ms)"
    TRANSIT_SIZE = "Transit Size(MB)"
    SDMA = "SDMA"
    RDMA = "RDMA"

    def __init__(self, collection_path: str, kwargs: dict = None):
        super().__init__(collection_path)
        # per-rank accumulators, lazily created on first access
        self.rank_bw_dict = defaultdict(lambda: {
            self.RDMA_TIME_MS: 0,
            self.RDMA_SIZE_MB: 0,
            self.SDMA_TIME_MS: 0,
            self.SDMA_SIZE_MB: 0,
        })

    @staticmethod
    def compute_ratio(dividend: float, divisor: float):
        """Return dividend/divisor rounded to 4 digits, or 0 for a (near-)zero divisor."""
        if abs(divisor) < 1e-15:
            return 0
        return round(dividend / divisor, 4)

    def load_communication_json(self):
        """Read cluster_communication.json; raise RuntimeError if it is absent."""
        json_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT, Constant.CLUSTER_COMM_JSON)
        if not os.path.exists(json_path):
            msg = "[ERROR] cluster_communication.json doesn't exist, terminate analysis."
            raise RuntimeError(msg)
        communication_json = FileManager.read_json_file(json_path)
        return communication_json

    def run(self):
        """Entry point: load data, compute bandwidth, emit bottleneck info."""
        self.path_check()
        communication_json = self.load_communication_json()
        self.process(communication_json)
        self.output()
        return self.output_format_data

    def process(self, communication_json: dict):
        """Accumulate bandwidth for every op of every step/group, then report."""
        for comm_group, group_dict in communication_json.items():
            for step, step_dict in group_dict.items():
                for op, op_dict in step_dict.items():
                    self.compute_bandwidth(op_dict)
        if self.rank_bw_dict:
            self.produce_bottleneck(self.RDMA_BANDWIDTH)
            self.produce_bottleneck(self.SDMA_BANDWIDTH)

    def compute_bandwidth(self, op_dict: dict):
        """Add one op's transit size/time to each rank, then refresh bandwidth."""
        for rank_id, rank_dict in op_dict.items():
            try:
                rank = int(rank_id)
            except ValueError as e:
                msg = "[ERROR] Cluster_communication.json has invalid structure."
                raise ValueError(msg) from e
            for comm_type, bw_dict in rank_dict.get(self.COMMUNICATION_BANDWIDTH_INFO, {}).items():
                # robustness: default missing transit entries to 0 instead of
                # letting .get() return None and crash the += accumulation
                if comm_type == self.SDMA:
                    self.rank_bw_dict[rank][self.SDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE, 0)
                    self.rank_bw_dict[rank][self.SDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME, 0)
                if comm_type == self.RDMA:
                    self.rank_bw_dict[rank][self.RDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE, 0)
                    self.rank_bw_dict[rank][self.RDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME, 0)

        for rank, rank_dict in self.rank_bw_dict.items():
            self.rank_bw_dict[rank][self.RDMA_BANDWIDTH] = self.compute_ratio(
                self.rank_bw_dict[rank][self.RDMA_SIZE_MB], self.rank_bw_dict[rank][self.RDMA_TIME_MS])
            self.rank_bw_dict[rank][self.SDMA_BANDWIDTH] = self.compute_ratio(
                self.rank_bw_dict[rank][self.SDMA_SIZE_MB], self.rank_bw_dict[rank][self.SDMA_TIME_MS])

    def produce_bottleneck(self, link_type: str):
        """Append a bottleneck message describing the bandwidth spread for link_type."""
        data_list = [rank_dict.get(link_type, 0) for rank_id, rank_dict in self.rank_bw_dict.items()]
        avg_bw = round(sum(data_list) / len(data_list), 3)
        if avg_bw == 0:
            return
        self.bottelneck += f'{link_type}: \n' \
                           f'The average is {avg_bw}, ' \
                           f'while the maximum is {round(max(data_list), 3)}GB/s and ' \
                           f'the minimum is {round(min(data_list), 3)}GB/s. ' \
                           f'the difference is {round(max(data_list) - min(data_list), 3)}GB/s. \n'

    def output(self):
        """Publish the per-rank bandwidth data and bottleneck text."""
        self.output_format_data[self.DATA] = self.rank_bw_dict
        self.output_format_data[self.BOTTLENECK] = self.bottelneck
diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/slow_rank_advice.py b/profiler/advisor_review/advisor_backend/cluster_advice/slow_rank_advice.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e789fb7fb688626df7e8f5b25b84e4955d6c2a3
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/cluster_advice/slow_rank_advice.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from collections import defaultdict
+from common_func_advisor.constant import Constant
+from common_func.file_manager import FileManager
+from cluster_advice.cluster_advice_base import ClusterAdviceBase
+from prof_bean_advisor.cluster_step_trace_time_bean import ClusterStepTraceTimeBean
+
+
class SlowRankAdvice(ClusterAdviceBase):
    """Identify slow ranks from the cluster step trace time statistics."""
    RANK = "rank"
    RATIO_THRESHOLD = 0.05
    BOTTLENECK_LIST = ['Computing', 'Communication', "Free"]

    def __init__(self, collection_path: str, kwargs: dict = None):
        super().__init__(collection_path)

    def load_step_time(self):
        """Read cluster_step_trace_time.csv; raise RuntimeError if it is absent."""
        csv_file = os.path.join(
            self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT, Constant.CLUSTER_STEP_TIME_CSV)
        if not os.path.exists(csv_file):
            raise RuntimeError("[ERROR] cluster_step_trace_time.csv doesn't exist, terminate analysis.")
        return FileManager.read_csv_file(csv_file, ClusterStepTraceTimeBean)

    def run(self):
        """Entry point: load step data, aggregate per rank, emit bottleneck info."""
        self.path_check()
        step_dict = self.process(self.load_step_time())
        self.output(step_dict)
        return self.output_format_data

    def process(self, step_data: list):
        """Aggregate compute/communication/free time per rank and flag outliers."""
        step_dict = defaultdict(lambda: [0, 0, 0, 0])
        for bean in step_data:
            if bean.type != self.RANK:
                continue
            totals = step_dict[bean.index]
            totals[0] += bean.compute
            totals[1] += bean.communication
            totals[2] += bean.free
        total_time_list = [sum(values) for values in step_dict.values()]
        if total_time_list:
            mean_total_time = sum(total_time_list) / len(total_time_list)
            for idx, _ in enumerate(self.BOTTLENECK_LIST):
                self.produce_bottleneck(step_dict, idx, mean_total_time)
        return step_dict

    def produce_bottleneck(self, step_dict: dict, produce_type: int, mean_total_time: float):
        """Append a bottleneck message when the max gap ratio exceeds the threshold."""
        data_list = [values[produce_type] for values in step_dict.values()]
        max_ratio = self.compute_max_gap_ratio(data_list, mean_total_time)
        if max_ratio <= self.RATIO_THRESHOLD:
            return
        name = self.BOTTLENECK_LIST[produce_type]
        self.bottelneck += f'{name} has some issues in the cluster, ' \
                           f'because the max difference of {name} time ' \
                           f'has reached {round(max_ratio * mean_total_time / 1000, 3)}ms. \n'

    def output(self, step_dict: dict):
        """Publish the per-rank timing data and bottleneck text."""
        self.output_format_data[self.DATA] = step_dict
        self.output_format_data[self.BOTTLENECK] = self.bottelneck
diff --git a/profiler/advisor_review/advisor_backend/common_func_advisor/__init__.py b/profiler/advisor_review/advisor_backend/common_func_advisor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/common_func_advisor/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/advisor_review/advisor_backend/common_func_advisor/constant.py b/profiler/advisor_review/advisor_backend/common_func_advisor/constant.py
new file mode 100644
index 0000000000000000000000000000000000000000..46a7fb24c2dade75c157f18118f29233eb924b88
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/common_func_advisor/constant.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from enum import Enum
+
+
class CsvTitle:
    """Version-independent column titles used when parsing kernel csv data.

    Titles that differ between csv versions (op name/type, task type and
    duration) are declared on the CsvTitleV1 / CsvTitleV2 subclasses.
    """
    MODEL_NAME = "Model Name"
    MODEL_ID = "Model ID"
    TASK_ID = "Task ID"
    STREAM_ID = "Stream ID"
    INFER_ID = "Infer ID"
    TASK_START_TIME = "Task Start Time(us)"
    TASK_WAIT_TIME = "Task Wait Time(us)"
    BLOCK_DIM = "Block Dim"
    MIX_BLOCK_DIM = "Mix Block Dim"
    HF32_ELIGIBLE = "HF32 Eligible"
    INPUT_SHAPES = "Input Shapes"
    INPUT_DATA_TYPES = "Input Data Types"
    INPUT_FORMATS = "Input Formats"
    OUTPUT_SHAPES = "Output Shapes"
    OUTPUT_DATA_TYPES = "Output Data Types"
    OUTPUT_FORMATS = "Output Formats"
    CONTEXT_ID = "Context ID"
    AICORE_TIME = "aicore_time(us)"
    AIC_TOTAL_CYCLES = "aic_total_cycles"
    AIC_MAC_TIME = "aic_mac_time(us)"
    AIC_MAC_RATIO = "aic_mac_ratio"
    AIC_SCALAR_TIME = "aic_scalar_time(us)"
    AIC_SCALAR_RATIO = "aic_scalar_ratio"
    AIC_MTE1_TIME = "aic_mte1_time(us)"
    AIC_MTE1_RATIO = "aic_mte1_ratio"
    AIC_MTE2_TIME = "aic_mte2_time(us)"
    AIC_MTE2_RATIO = "aic_mte2_ratio"
    AIC_FIXPIPE_TIME = "aic_fixpipe_time(us)"
    AIC_FIXPIPE_RATIO = "aic_fixpipe_ratio"
    AIC_ICACHE_MISS_RATE = "aic_icache_miss_rate"
    AIV_TIME = "aiv_time(us)"
    AIV_TOTAL_CYCLES = "aiv_total_cycles"
    AIV_VEC_TIME = "aiv_vec_time(us)"
    AIV_VEC_RATIO = "aiv_vec_ratio"
    AIV_SCALAR_TIME = "aiv_scalar_time(us)"
    AIV_SCALAR_RATIO = "aiv_scalar_ratio"
    AIV_MTE2_TIME = "aiv_mte2_time(us)"
    AIV_MTE2_RATIO = "aiv_mte2_ratio"
    AIV_MTE3_TIME = "aiv_mte3_time(us)"
    AIV_MTE3_RATIO = "aiv_mte3_ratio"
    AIV_ICACHE_MISS_RATE = "aiv_icache_miss_rate"
    CUBE_UTILIZATION = "cube_utilization( %)"
    TASK_DURATION_SUM = "Task Duration Sum(us)"
    TASK_DURATION_MEAN = "Task Duration Mean(us)"
    TASK_DURATION_STD = "Task Duration Std(us)"
    TASK_DURATION_RATIO = "Task Duration Ratio(100%)"
    SIZE = "size(MB)"
    THROUGHPUT = "throughput(GB/s)"
    COLOR = "color"
    GAP = "Gap(us)"
    DURATION_SUM = "Duration Sum(us)"
    COUNT = "Count"
    MAX_DURATION = "Max Duration(us)"
    MIN_DURATION = "Min Duration(us)"
    AVG_DURATION = "Avg Duration(us)"
    DURATION_RATIO = "Duration Ratio"
    INDEX = "Index"
+
+
# CsvTitleV1 subclasses CsvTitle with the column names of the legacy csv format
class CsvTitleV1(CsvTitle):
    OP_NAME = "Op Name"
    OP_TYPE = "OP Type"
    TASK_TYPE = "Task Type"
    TASK_DURATION = "Task Duration(us)"
+
+
# CsvTitleV2 subclasses CsvTitle with the column names of the new csv format
class CsvTitleV2(CsvTitle):
    OP_NAME = "Name"
    OP_TYPE = "Type"
    TASK_TYPE = "Accelerator Core"
    TASK_DURATION = "Duration(us)"
+
+
class Constant:
    """Shared constants for the advisor backend: dtype sizes, analysis mode
    and advice names, trace-view json field names, and known fusion patterns.

    TITLE selects the active csv column-title set (defaults to the new
    CsvTitleV2; update_title() switches to the legacy CsvTitleV1).
    """
    # size in bytes of each supported tensor dtype
    DTYPE_SIZE_MAP = {"int8": 1, "uint8": 1,
                      "int16": 2, "uint16": 2,
                      "int32": 4, "uint32": 4,
                      "int64": 8, "uint64": 8,
                      "float16": 2,
                      "bfloat16": 2,
                      "bf16": 2,
                      "dt_bf16": 2,
                      "float32": 4,
                      "float": 4,
                      "float64": 8,
                      "complex64": 8,
                      "complex128": 16,
                      "bool": 1}
    TP_THRESHOLD = 1150
    MAX_INPUT_MODE_LEN = 30
    MAX_INPUT_ADVICE_LEN = 30
    SMALL_OP_DUR_RATIO = 0.2
    SMALL_OP_NUM_RATIO = 0.2
    BYTE_UNIT_TRANS = 1024
    UNIT_TRANS = 1000

    # mode list
    COMPUTE = "compute"
    TIMELINE = "timeline"
    CLUSTER = "cluster"
    OVERALL = "overall"
    PIPELINE = "pipeline"

    # advice list
    SLOW_RANK = "slow rank"
    SLOW_LINK = "slow link"
    KERNEL = "kernel"

    # compute
    NPU_FUSED = "npu_fused"
    NPU_SLOW = "npu_slow"

    # timeline
    OPTIM = "optimizer"
    OP_SCHE = "op_schedule"

    # overall
    SUMMARY = "summary"

    PT_PROF_SUFFIX = "ascend_pt"
    ASCEND_PROFILER_OUTPUT = "ASCEND_PROFILER_OUTPUT"
    COLLECTION_PATH = "collection_path"
    CLUSTER_ANALYSIS_OUTPUT = "cluster_analysis_output"
    KERNEL_DETAILS_CSV = "kernel_details.csv"
    CLUSTER_STEP_TIME_CSV = "cluster_step_trace_time.csv"
    CLUSTER_COMM_JSON = "cluster_communication.json"

    # pipeline: trace_view.json event field names and well-known values
    OP_NAME = "name"
    OP_TID = "tid"
    PID = "pid"
    TS = "ts"
    DUR = "dur"
    CAT = "cat"
    ARGS = "args"
    PH = "ph"
    ID = "id"
    PH_START = "s"
    PH_BEGIN = "B"
    PH_END = "E"
    PH_META = "M"
    PH_X = "X"
    CNAME = "cname"
    PROCESS_NAME = "process_name"
    FRAMEWORK_NAME = "Python"
    ASCEND_HARDWARE_NAME = "Ascend Hardware"
    ASYNC_NPU = "async_npu"
    STEP_PREFIX = "ProfilerStep#"
    FP_ATEN_OP = "aten"
    FP_C10D_OP = "c10d"
    HCOM_OP_PREFIX = "hcom_"
    BP_AUTOGRAD_OP = "autograd"
    TRACE_VIEW_JSON = "trace_view.json"

    # pattern_dict key: pattern, value: pattern name
    PATTERN_DICT = {("Add", "DropOutDoMask", "Add"): "bias_dropout_add",
                    ("BatchMatMul", "Mul", "Cast", "Mul", "MaskedFill", "SoftmaxV2", "Cast", "DropOutDoMask",
                     "AsStrided", "BatchMatMul", "Transpose"): "FA",
                    ("Transpose", "Transpose", "Transpose", "Mul", "Transpose", "BatchMatMulV2", "MaskedFill",
                     "Cast", "SoftmaxV2", "Cast", "DropOutDoMask", "BatchMatMulV2", "Transpose"): "FA",
                    ("Transpose", "BatchMatMulV2", "Transpose", "Transpose", "BatchMatMulV2", "ZerosLike",
                     "DropOutDoMask", "Cast", "SoftmaxGrad", "Cast", "MaskedFill", "BatchMatMulV2",
                     "BatchMatMulV2", "Mul"): "FA",
                    ("Cast", "Square", "ReduceMeanD", "Add", "Rsqrt", "Cast", "Cast", "Mul", "Cast", "Cast",
                     "Mul", "Cast"): "RMSNORM",
                    ("Cast", "LayerNorm", "Cast"): "LayerNorm",
                    ("Add", "LayerNorm"): "AddLayerNorm",
                    ("Add", "LayerNormV3"): "AddLayerNorm",
                    ("Gelu", "Add"): "GeluAdd",
                    ("Cast", "Square", "MemSet", "ReduceMean", "Add", "Rsqrt", "Mul", "Cast", "Mul"): "RMSNorm",
                    ("BatchMatMul", "RealDiv", "Add", "Maximum", "SoftmaxV2", "Cast", "BatchMatMul"): "FA",
                    ("BatchMatMulV2", "RealDiv", "Add", "Cast", "Maximum", "Cast", "SoftmaxV2", "AsStrided",
                     "BatchMatMulV2"): "FA",
                    ("BatchMatMulV2", "RealDiv", "Add", "Cast", "SoftmaxV2", "Cast", "BroadcastTo",
                     "BatchMatMulV2"): "FA",
                    ("Mul", "Slice", "Neg", "Slice", "ConcatD", "Cast", "Mul", "Add"): "RotaryMul",
                    ("Mul", "AsStrided", "Neg", "AsStrided", "ConcatD", "Mul", "Add"): "RotaryMul",
                    ("Mul", "Slice", "Neg", "Slice", "ConcatD", "Mul", "Add"): "RotaryMul",
                    ("MatMulV2", "Swish", "MatMulV2", "Mul", "MatMulV2"): "FFN",
                    ("Transpose", "Transpose", "GatherElement", "Transpose"): "GatherElement",
                    ("Slice", "Slice", "Swish", "Mul"): "torch_npu.npu_swiglu",
                    ("Cast", "Mul", "MaskedFill", "SoftmaxV2", "Cast"): "torch_npu.npu_scaled_masked_softmax",
                    ("Mul", "Slice", "Neg", "Slice", "ConcatD", "Mul"): "torch_npu.npu_rotary_mul",
                    ("Cast", "Square", "ReduceMeanD", "Add", "Rsqrt", "Mul", "Cast", "Mul"): "torch_npu.npu_rms_norm"}
    TITLE = CsvTitleV2

    @classmethod
    def update_title(cls):
        """Switch the active column-title set to the legacy (V1) csv names."""
        cls.TITLE = CsvTitleV1
+
+
class CoreType:
    """Accelerator core / task type identifiers as reported in kernel data."""
    AIV = "AI_VECTOR_CORE"
    AIC = "AI_CORE"
    AICPU = "AI_CPU"
    MIX_AIV = "MIX_AIV"
    MIX_AIC = "MIX_AIC"
    HCCL = "HCCL"
+
+
class PerfColor(Enum):
    """Color buckets for operator performance classification; the numeric
    value increases from WHITE (0) to RED (3)."""
    WHITE = 0
    GREEN = 1
    YELLOW = 2
    RED = 3
diff --git a/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_json.py b/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_json.py
new file mode 100644
index 0000000000000000000000000000000000000000..8171f06ee235fc02da715044b4d310087c36c102
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_json.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from abc import abstractmethod
+from dataclasses import dataclass
+from dataclasses import field
+from typing import Dict
+from typing import List
+
+import pandas as pd
+
+from common_func.file_manager import FileManager
+
+
@dataclass
class TraceObj:
    """Base record for one trace_view.json event.

    Field names mirror the chrome-trace event keys; subclasses define
    hash() to produce the key used to index the event.
    """
    ph: str = ""
    bp: str = ""
    cat: str = ""
    name: str = ""
    pid: int = 0
    tid: int = 0
    id: int = 0
    ts: str = ""
    dur: float = 0.0
    # bugfix: the default was field(default='unknown') - a *string* on a dict
    # field, so self.args.get(...) crashed on any defaulted instance; a dict
    # field needs default_factory to get a fresh dict per instance
    args: dict = field(default_factory=dict)

    @abstractmethod
    def hash(self):
        """Return the indexing key for this event; must be overridden."""
        raise Exception("To be implemented")

    def valid(self):
        """An event is valid once it carries a non-empty name."""
        return self.name != ""

    def check_hashable(self):
        # bugfix: self.__class__.name resolved to the dataclass field default
        # ("") instead of the class name; __name__ is the intended attribute
        if not self.valid():
            raise Exception("Illegal {} to hash".format(self.__class__.__name__))
+
+
@dataclass
class Process(TraceObj):
    """Metadata record for a "process_name" meta event."""

    def hash(self):
        """Key the process by its args name."""
        self.check_hashable()
        # msprof guarantees the name is unique
        return self.args.get("name")
+
+
@dataclass
class Thread(TraceObj):
    """Metadata record for a "thread_name" meta event."""

    def hash(self):
        """Key the thread by its args name."""
        self.check_hashable()
        # msprof guarantees the name is unique
        return self.args.get("name")
+
+
@dataclass
class DurationEvent(TraceObj):
    """A complete ('X' phase) duration event; keyed by its start timestamp."""

    def hash(self):
        """Return the start timestamp as the lookup key."""
        self.check_hashable()
        return self.ts
+
+
@dataclass
class FlowEvent(TraceObj):
    """A flow arrow assembled from records sharing one id; keyed by the
    end-point timestamp (s_point_ts/e_point_ts are filled while pairing)."""
    s_point_ts: str = ""  # ts of the flow start ('s') record
    e_point_ts: str = ""  # ts of the flow end record

    def hash(self):
        """Return the end-point timestamp as the lookup key."""
        self.check_hashable()
        return self.e_point_ts
+
+
class TraceViewJson:
    """Indexed view over a trace_view.json file.

    On construction, the raw trace list is partitioned into processes,
    threads, per-process duration events and torch->npu flow events, so
    that a python call stack can be resolved from an op timestamp.
    """

    def __init__(self, path):
        self.processes: Dict[str, Process] = dict()
        self.threads: Dict[str, Thread] = dict()
        self.python_dur_events: Dict[str, DurationEvent] = dict()
        self.cann_dur_events: Dict[str, DurationEvent] = dict()
        self.ascend_hardware_dur_events: Dict[str, DurationEvent] = dict()
        self.torch_2_npu_flow_events: Dict[str, FlowEvent] = dict()
        traces = FileManager.read_json_file(path)
        self._load_obj(traces)

    def get_call_stack(self, data: pd.DataFrame, index_id: int, ts_col: str) -> str:
        """Return the python call stack (newline-joined) for row index_id.

        The timestamp in ts_col is mapped to a torch->npu flow event, then
        to the python duration events that contain its start point; returns
        an empty string (after printing an error) if any step fails.
        """
        if ts_col not in data.columns.tolist():
            print("[ERROR] No {} col found in data columns.".format(ts_col))
            return ""
        row = data.loc[index_id]
        timestamp = row[ts_col]
        flow_event = self.get_torch_2_npu_flow_event(timestamp)
        if not flow_event.valid():
            print("[ERROR] Get flow event failed for pattern {}.".format(row['pattern']))
            return ""
        flow_event_s_key = flow_event.s_point_ts
        python_dur_events = self.get_python_dur_events_contain_ts(flow_event_s_key)
        if not python_dur_events:
            print("[ERROR] No python dur event found for pattern {}.".format(row['pattern']))
            return ""
        # keep compatibility between old and new callstack formats
        if python_dur_events[0].args.get("Call stack"):
            # old format: full call stack stored in the event args
            call_stack_list = python_dur_events[0].args.get("Call stack").split(";")
        else:
            python_dur_events.sort(key=lambda e: e.ts)
            # new format: reconstruct the stack from python_function events
            call_stack_list = [event.name for event in python_dur_events if event.cat == "python_function"]
        call_stack = "\n".join(call_stack_list)
        return call_stack

    def get_torch_2_npu_flow_event(self, end_time) -> FlowEvent:
        """Return the flow event ending at end_time, or an empty FlowEvent."""
        if not self.torch_2_npu_flow_events or not self.torch_2_npu_flow_events.get(end_time):
            print("[ERROR] Find flow event failed for ts: {}".format(end_time))
            return FlowEvent()
        return self.torch_2_npu_flow_events.get(end_time)

    def get_python_dur_events_contain_ts(self, ts) -> List[DurationEvent]:
        """Return all python duration events whose span contains timestamp ts."""
        res = []
        for event in self.python_dur_events.values():
            if float(event.ts) <= float(ts) <= float(event.ts) + event.dur:
                res.append(event)
        return res

    def _load_obj(self, traces):
        """Partition the raw trace list into the per-kind indexes."""
        self._load_format(traces)
        if not self._check_format():
            print("[ERROR] parse json failed for error format")
            return
        self._load_duration_events(traces)
        self._load_torch_to_npu_flow_events(traces)

    def _check_format(self):
        # only these two processes are required for the current feature; extendable
        check_processes = ['Python', 'Ascend Hardware']
        for check_process in check_processes:
            if check_process in self.processes:
                continue
            print("[ERROR] {} process not found in json.".format(check_process))
            return False
        return True

    # load the pid/tid metadata headers
    def _load_format(self, traces: List[Dict]):
        for i, trace in enumerate(traces):
            if trace.get('name') == 'process_name':
                if not trace.get('args') or not trace.get('args').get('name') or not trace.get('pid'):
                    continue
                process = Process(**trace)
                self.processes[process.hash()] = process
            if trace.get('name') == 'thread_name':
                if not trace.get('args') or not trace.get('args').get('name') or not trace.get('tid'):
                    continue
                thread = Thread(**trace)
                self.threads[thread.hash()] = thread

    def _load_duration_events(self, traces: List[Dict]):
        """Index 'X' phase events by start ts, bucketed per owning process."""
        def check_events(_trace):
            return _trace.get('name') and _trace.get("ts") and _trace.get("dur")

        python_pid = self.processes.get("Python").pid
        # NOTE(review): _check_format only guarantees 'Python' and
        # 'Ascend Hardware'; a trace without a 'CANN' process would raise
        # AttributeError on the next line - confirm CANN is always present.
        cann_pid = self.processes.get("CANN").pid
        ascend_hardware_pid = self.processes.get("Ascend Hardware").pid
        for i, trace in enumerate(traces):
            if trace.get('ph') != 'X':
                continue
            if not check_events(trace):
                continue
            event = DurationEvent(**trace)
            if trace.get('pid') == python_pid:
                self.python_dur_events[event.hash()] = event
            elif trace.get('pid') == cann_pid:
                self.cann_dur_events[event.hash()] = event
            elif trace.get("pid") == ascend_hardware_pid:
                self.ascend_hardware_dur_events[event.hash()] = event

    def _load_torch_to_npu_flow_events(self, traces: List[Dict]):
        """Pair flow ('s'/'f') records sharing an id and index them by end ts."""
        def check_events(_trace):
            return _trace.get('name') and _trace.get("id") and _trace.get("ts")

        flow_events_table_by_id = dict()

        # NOTE(review): this is the Process *object*, not its pid, so the
        # pid comparison below is always True and the filter effectively
        # reduces to ph in ('s', 'f'); presumably '.pid' was intended and
        # the condition should use 'or' - confirm before changing.
        python_pid = self.processes.get("Python")
        for i, trace in enumerate(traces):
            if trace.get('ph') != 's' and trace.get('ph') != 'f' and trace.get('pid') != python_pid:
                continue
            if not check_events(trace):
                continue
            event = flow_events_table_by_id.get(trace.get("id"))
            if not event:
                event = FlowEvent(**trace)
            if trace.get('ph') == 's':
                event.s_point_ts = trace.get('ts')
            else:
                event.e_point_ts = trace.get('ts')
            flow_events_table_by_id[event.id] = event

        self.torch_2_npu_flow_events = {eve.hash(): eve for eve in flow_events_table_by_id.values()}
diff --git a/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_preprocessor.py b/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b9baa32d9423a46bf93d563a6fabbbbb652aaf8
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_preprocessor.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+import sys
+from typing import Optional
+from dataclasses import dataclass
+
+from common_func_advisor.constant import Constant
+
+
@dataclass
class FineTraceViewData:
    """Preprocessed trace-view data: key pids/tids, the hcom time span, and
    the filtered op collections, kept sorted by timestamp via sort()."""
    py_pid: int = -1
    fp_tid: int = -1
    bp_tid: int = -1
    ascend_pid: int = -1
    min_ts: str = str(sys.maxsize)
    max_ts: str = "0"
    hcom_tids: list = None
    fp_ops: list = None
    bp_ops: list = None
    hcom_ops: list = None
    npu_ops_ts_dur: dict = None
    torch_to_npu_links: list = None

    def __post_init__(self):
        # swap falsy (None/empty) defaults for fresh containers
        for attr in ("hcom_tids", "fp_ops", "bp_ops", "hcom_ops", "torch_to_npu_links"):
            if not getattr(self, attr):
                setattr(self, attr, [])
        if not self.npu_ops_ts_dur:
            self.npu_ops_ts_dur = {}

    def sort(self):
        """Sort every op collection in place by its timestamp field."""
        by_ts = lambda op: op[Constant.TS]
        for ops in (self.fp_ops, self.bp_ops, self.hcom_ops, self.torch_to_npu_links):
            ops.sort(key=by_ts)
+
+
class TraceViewPreProcessor:
    """
    Trace view data preprocess
    """

    # precompiled matchers for hcom point-to-point op names,
    # e.g. hcom_BatchSendRecv__101_0_1 / hcom_send__101_0_1 / hcom_receive__101_0_1
    _SEND_RECV_PATTERNS = (
        re.compile(r'hcom_\w+SendRecv__\d+'),
        re.compile(r'hcom_send__\d+'),
        re.compile(r'hcom_receive__\d+'),
    )

    @staticmethod
    def _is_fp_op(op_name: str) -> bool:
        """
        check whether op is fp op
        """
        return op_name.startswith((Constant.FP_ATEN_OP, Constant.FP_C10D_OP))

    @staticmethod
    def _is_fp_data(data: dict, fp_tid: int, py_pid: int) -> bool:
        """
        check whether data is valid fp data
        """
        return data[Constant.OP_TID] == fp_tid and \
               Constant.TS in data and Constant.DUR in data and \
               not data[Constant.OP_NAME].startswith(Constant.STEP_PREFIX) and \
               data[Constant.PID] == py_pid

    @staticmethod
    def _is_bp_op(op_name: str) -> bool:
        """
        check whether op is bp op
        """
        return op_name.startswith(Constant.BP_AUTOGRAD_OP)

    @staticmethod
    def _is_bp_data(data: dict, bp_tid: int, py_pid: int) -> bool:
        """
        check whether data is valid bp data
        """
        return data[Constant.OP_TID] == bp_tid and \
               Constant.TS in data and Constant.DUR in data and \
               data[Constant.PID] == py_pid

    @staticmethod
    def _is_torch_to_npu_link(data: dict, fp_tid: int) -> bool:
        """
        check whether data is torch to npu link
        """
        # NOTE(review): this compares the PID field against a tid value;
        # it looks like either Constant.OP_TID or a pid argument was
        # intended - confirm against the trace format before changing.
        return Constant.CAT in data and data[Constant.CAT] == Constant.ASYNC_NPU and \
               data[Constant.PH] == Constant.PH_START and \
               data[Constant.PID] == fp_tid

    @staticmethod
    def _is_send_recv_op(op_name: str) -> bool:
        """
        check whether op is hcom send or recv op
        """
        return any(p.match(op_name) for p in TraceViewPreProcessor._SEND_RECV_PATTERNS)

    @staticmethod
    def _is_hcom_op(op_name: str) -> bool:
        """
        check whether data is hcom data
        """
        return op_name.startswith(Constant.HCOM_OP_PREFIX)

    @staticmethod
    def _is_python_process(data: dict) -> bool:
        """
        check whether data is python process
        """
        return Constant.PH in data and data[Constant.PH] == Constant.PH_META and \
               data[Constant.OP_NAME] == Constant.PROCESS_NAME and \
               data[Constant.ARGS][Constant.OP_NAME] == Constant.FRAMEWORK_NAME

    @staticmethod
    def _is_step_op(data: dict) -> bool:
        """
        check whether data is step data
        """
        return data[Constant.OP_NAME].startswith(Constant.STEP_PREFIX)

    @staticmethod
    def _is_ascend_process(data: dict) -> bool:
        """
        check whether data is ascend process data
        """
        return Constant.PH in data and data[Constant.PH] == Constant.PH_META and \
               data[Constant.OP_NAME] == Constant.PROCESS_NAME and \
               data[Constant.ARGS][Constant.OP_NAME] == Constant.ASCEND_HARDWARE_NAME

    @staticmethod
    def _is_npu_op(data: dict, ascend_pid: int) -> bool:
        """
        check whether data is npu op
        """
        return Constant.PH in data and data[Constant.PH] == Constant.PH_X and \
               not data[Constant.OP_NAME].isupper() and \
               data[Constant.PID] == ascend_pid

    def process(self, raw_data: list) -> Optional[FineTraceViewData]:
        """
        preprocess raw data: classify events, resolve fp/bp tids and the
        python/ascend pids, and collect the sorted op collections.
        """
        if not raw_data:
            print("[ERROR] No raw data found in trace view data.")
            return None

        raw_fp_tids, raw_bp_tids, raw_hcom_tids = set(), set(), set()
        fine_data = FineTraceViewData()

        # counting fp ops and bp ops tid and ascend pid
        for data in raw_data:
            if self._is_fp_op(data[Constant.OP_NAME]):
                raw_fp_tids.add(data[Constant.OP_TID])
            elif self._is_bp_op(data[Constant.OP_NAME]):
                raw_bp_tids.add(data[Constant.OP_TID])
            elif self._is_send_recv_op(data[Constant.OP_NAME]):
                fine_data.hcom_ops.append(data)
                raw_hcom_tids.add(data[Constant.OP_TID])
            elif self._is_python_process(data):
                fine_data.py_pid = data[Constant.PID]
            elif self._is_ascend_process(data):
                fine_data.ascend_pid = data[Constant.PID]

            # find max and min ts in hcom ops
            if self._is_hcom_op(data[Constant.OP_NAME]):
                # for compatibility with old data (ts is float type)
                ts = data[Constant.TS] if not isinstance(data[Constant.TS], float) else str(data[Constant.TS])
                # bugfix: compare timestamps numerically; plain string min/max
                # orders lexicographically and is wrong across digit counts
                # (e.g. "1000.0" < "999.0")
                fine_data.min_ts = min(fine_data.min_ts, ts, key=float)
                fine_data.max_ts = max(fine_data.max_ts, ts, key=float)

        # an fp tid that also hosts bp ops is not a pure fp thread
        unique_fp_tid = list(raw_fp_tids - raw_bp_tids)
        unique_bp_tid = list(raw_bp_tids)
        fine_data.hcom_tids = list(raw_hcom_tids)

        if not unique_fp_tid or not unique_bp_tid:
            print("[INFO] No fp or bp tid found in trace view data.")
        else:
            fine_data.fp_tid, fine_data.bp_tid = unique_fp_tid[0], unique_bp_tid[0]

        # filter fp ops and bp ops and torch_to_npu_links
        for data in raw_data:
            if self._is_fp_data(data, fine_data.fp_tid, fine_data.py_pid):
                fine_data.fp_ops.append(data)
            elif self._is_bp_data(data, fine_data.bp_tid, fine_data.py_pid):
                fine_data.bp_ops.append(data)
            elif self._is_torch_to_npu_link(data, fine_data.fp_tid):
                fine_data.torch_to_npu_links.append(data)
            elif self._is_npu_op(data, fine_data.ascend_pid):
                fine_data.npu_ops_ts_dur[data[Constant.TS]] = data[Constant.DUR]

        fine_data.sort()
        return fine_data
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/__init__.py b/profiler/advisor_review/advisor_backend/compute_advice/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/compute_advice_base.py b/profiler/advisor_review/advisor_backend/compute_advice/compute_advice_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..cafbafd8e28c162bc76edb2f77ebd0645fed552f
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/compute_advice_base.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import abstractmethod
+from collections import defaultdict
+import os
+
+from advice_base import AdviceBase
+from common_func.file_manager import FileManager
+
+
class ComputeAdviceBase(AdviceBase):
    """Base class for computation advice.

    Resolves the kernel_details.csv path from an ``*_ascend_pt`` collection
    directory (or a direct csv path) and determines whether call-stack
    information was collected.
    """
    # File/directory name constants; use these instead of repeating literals.
    ASCEND_PT = 'ascend_pt'
    ASCEND_PROFILER_OUTPUT = 'ASCEND_PROFILER_OUTPUT'
    KERNEL_DETAIL_FILE = "kernel_details.csv"
    TRACE_VIEW_FILE = "trace_view.json"

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.kernel_details_path = ""
        self.has_preparse = False
        self.preparse_data = defaultdict(list)
        # None = not yet determined; has_callstack() caches the result here.
        self.call_stack = None
        self.trace_view_path = ""

    def path_check(self):
        """
        Check whether the input path is valid.

        Accepts either an ``*_ascend_pt`` directory (kernel_details.csv is
        looked up under ASCEND_PROFILER_OUTPUT) or a kernel_details.csv file
        directly. On success ``self.kernel_details_path`` is set and True is
        returned.
        """
        if not os.path.exists(self.collection_path):
            print("[ERROR] Path: {} is not exist.".format(self.collection_path))
            return False
        # Use the class constants so the accepted names stay in one place.
        if os.path.isdir(self.collection_path) and self.collection_path.endswith(self.ASCEND_PT):
            self.kernel_details_path = os.path.join(self.collection_path, self.ASCEND_PROFILER_OUTPUT,
                                                    self.KERNEL_DETAIL_FILE)
            if not os.path.exists(self.kernel_details_path):
                print("[ERROR] kernel_details.csv is not exist in the Path: {}.".format(
                    os.path.join(self.collection_path, self.ASCEND_PROFILER_OUTPUT)))
                return False
        elif os.path.isfile(self.collection_path) and \
                os.path.basename(self.collection_path) == self.KERNEL_DETAIL_FILE:
            self.kernel_details_path = self.collection_path
        else:
            print("[ERROR] Please input ascend_pt or kernel_details.csv")
            return False
        print("[INFO] Start to analyse the target file: {}".format(self.kernel_details_path))
        self.preparse()
        return True

    def has_callstack(self):
        """Return True when profiling was collected with Python call stacks.

        Reads profiler_info*.json under the collection path; requires
        with_stack=True and CPU activity in the recorded common_config.
        The result is cached in ``self.call_stack``.
        """
        if self.call_stack is not None:
            return self.call_stack
        profiler_info_json_path = ""
        for file in os.listdir(self.collection_path):
            if file.startswith("profiler_info"):
                profiler_info_json_path = os.path.join(self.collection_path, file)
                break
        if not profiler_info_json_path:
            self.call_stack = False
            return self.call_stack
        self.trace_view_path = os.path.join(self.collection_path, self.ASCEND_PROFILER_OUTPUT,
                                            self.TRACE_VIEW_FILE)
        if not os.path.exists(profiler_info_json_path) or not os.path.exists(self.trace_view_path):
            self.call_stack = False
            return self.call_stack
        info = FileManager.read_json_file(profiler_info_json_path)
        if not info.get("config") or not info.get("config").get("common_config") \
                or not info.get("config").get("common_config").get("with_stack"):
            self.call_stack = False
            return self.call_stack
        activities = info.get("config").get("common_config").get("activities")
        if not activities or "ProfilerActivity.CPU" not in activities:
            self.call_stack = False
            return self.call_stack
        self.call_stack = info.get("config").get("common_config").get("with_stack")
        return self.call_stack

    @abstractmethod
    def run(self):
        """
        analyze profiling data and advice
        """

    @abstractmethod
    def output(self):
        """
        output relevant data
        """
        self.output_format_data[self.DATA] = self.cur_data
        self.output_format_data[self.BOTTLENECK] = self.cur_bottleneck
        self.output_format_data[self.ADVICE] = self.cur_advice

    def preparse(self):
        # NOTE(review): in this diff preparse is a no-op beyond the guard —
        # confirm whether the remainder of the method was lost upstream.
        if self.has_preparse:
            return
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/__init__.py b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/csv_analyzer.py b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/csv_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c85c14d618ceda199c9c376abc27a3581eed97b8
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/csv_analyzer.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import multiprocessing
+
+import pandas as pd
+import numpy as np
+
+from common_func_advisor.constant import Constant
+from .op_perf import OpPerfFactory
+
+
class CSVAnalyzer:
    """Scans kernel_details.csv for consecutive op sequences that match a
    known fusible pattern (keys of Constant.PATTERN_DICT)."""

    def __init__(self, path) -> None:
        self._path = path

    def process(self):
        """Return a DataFrame with one row per known pattern occurrence set.

        Columns: pattern_name, pattern, len, count, duration sum(us),
        op durations(us), index, first_timestamp.
        """
        df = pd.read_csv(self._path, dtype={"Start Time(us)": str})
        # Analyse whether fusible operator sequences exist.
        op_type_list = df["Type"].tolist()
        duration_list = df["Duration(us)"].tolist()
        start_times = df["Start Time(us)"].tolist()
        # Strip the trailing "\t" separator; rstrip keeps the value intact
        # when the separator is absent (slicing [:-1] would drop a digit).
        start_times = [start_time.rstrip("\t") for start_time in start_times]
        result_list = []
        for pattern in Constant.PATTERN_DICT:
            result_list.extend(self.find_all_sub_lists(op_type_list, duration_list, start_times, pattern))
        data_frame = pd.DataFrame(result_list)
        data_frame.columns = ["pattern_name", "pattern", "len", "count", "duration sum(us)", "op durations(us)",
                              "index", "first_timestamp"]
        return data_frame

    @staticmethod
    def find_all_sub_lists(op_type_list, duration_list, start_times, expect_sub_list):
        """Locate every occurrence of ``expect_sub_list`` in ``op_type_list``.

        Returns a list of rows: [pattern_name, pattern, len, count,
        total duration, per-op duration sums, start indices, first timestamp].
        A zero row is emitted when the pattern never occurs.
        """
        # Map each matched sub-list to its occurrence count and positions.
        len_sub_list = len(expect_sub_list)
        expect_sub_list = tuple(expect_sub_list)
        sublist_dict = {}
        # Slide a window of len_sub_list over the op sequence.
        for i in range(len(op_type_list) - len_sub_list + 1):
            sublist = tuple(op_type_list[i:i + len_sub_list])
            if sublist != expect_sub_list:
                continue
            # First hit initialises the record; later hits accumulate into it.
            if sublist in sublist_dict:
                # count
                sublist_dict[sublist][0] += 1
                # index
                sublist_dict[sublist][1].append(i)
                # total duration
                sublist_dict[sublist][2] += sum(duration_list[i:i + len_sub_list])
                # per-op duration sums (element-wise accumulation)
                zip_data = zip(sublist_dict[sublist][3], duration_list[i:i + len_sub_list])
                sublist_dict[sublist][3] = [a + b for a, b in zip_data]
            else:
                sublist_dict[sublist] = [1, [i], sum(duration_list[i:i + len_sub_list]),
                                         duration_list[i:i + len_sub_list], len_sub_list, start_times[i]]
        # Collect every matched sub-list into result rows.
        repeated_sublists = []
        for sublist, (count, index, duration_sum, op_durations, sublist_len, first_time) in sublist_dict.items():
            pattern_name = Constant.PATTERN_DICT.get(sublist, "unknown")
            op_durations = [round(num, 2) for num in op_durations]
            repeated_sublists.append([pattern_name, sublist, sublist_len, count,
                                      duration_sum, op_durations, index, first_time])
        if len(sublist_dict) == 0:
            pattern_name = Constant.PATTERN_DICT.get(expect_sub_list, "unknown")
            repeated_sublists.append([pattern_name, expect_sub_list, 0, 0, 0, 0, 0, 0])
        # Return all matched (possibly repeated) sub-lists.
        return repeated_sublists
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/json_analyzer.py b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/json_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd2a72ffa39bfde1b3e59450c6d76f51d98110d9
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/json_analyzer.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+
+from common_func_advisor.trace_view_json import TraceViewJson
+
+
class JSONAnalyzer(object):
    """Resolves Python call stacks for fused-pattern rows via trace_view.json."""

    def __init__(self, path):
        self._path = path

    def get_custom_code(self, data: pd.DataFrame, ts_col: str, output_col: str):
        """Return a one-column DataFrame (``output_col``) with the call stack
        for each row of ``data``, matched through the torch→NPU flow event at
        ``row[ts_col]``. Rows that cannot be resolved get an empty string.
        """
        trace_json = TraceViewJson(self._path)
        callstacks = pd.DataFrame(columns=[output_col])

        # The column check is loop-invariant: test it once up front instead
        # of on every row (the original re-checked inside the loop).
        if ts_col not in data.columns.tolist():
            print("[ERROR] No {} col found in data columns.".format(ts_col))
            return callstacks
        for i, row in data.iterrows():
            timestamp = row[ts_col]
            flow_event = trace_json.get_torch_2_npu_flow_event(timestamp)
            if not flow_event.valid():
                print("[ERROR] Get flow event failed for pattern {}.".format(row['pattern']))
                callstacks.loc[i] = ""
                continue
            flow_event_s_key = flow_event.s_point_ts
            python_dur_events = trace_json.get_python_dur_events_contain_ts(flow_event_s_key)
            if not python_dur_events:
                print("[ERROR] No python dur event found for pattern {}.".format(row['pattern']))
                callstacks.loc[i] = ""
                continue
            # Stay compatible with both old and new call-stack formats.
            if python_dur_events[0].args.get("Call stack"):
                # Old format: single "Call stack" arg, ';'-separated.
                callstack = python_dur_events[0].args.get("Call stack").split(";")
            else:
                python_dur_events.sort(key=lambda e: e.ts)
                # New format: reconstruct from python_function duration events.
                callstack = [event.name for event in python_dur_events if event.cat == "python_function"]
            callstack_str = "\n".join(callstack)
            callstacks.loc[i] = callstack_str
        return callstacks
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/op_perf.py b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/op_perf.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bcbed5a75807b57a55787c743cfaaff55a68589
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/op_perf.py
@@ -0,0 +1,196 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+from typing import Dict
+
+from common_func_advisor.constant import Constant
+from common_func_advisor.constant import CoreType
+from common_func_advisor.constant import PerfColor
+
+
class OpPerfFactory:
    """Builds the OpPerf flavour matching an op's accelerator core type."""

    @classmethod
    def build(cls, op_row: Dict):
        """Return VecOpPerf for AIV ops, CubeOpPerf for AIC ops, OpPerf otherwise."""
        task_type = op_row.get(Constant.TITLE.TASK_TYPE)
        if task_type == CoreType.AIV:
            return VecOpPerf(op_row)
        if task_type == CoreType.AIC:
            return CubeOpPerf(op_row)
        return OpPerf(op_row)
+
+
class OpPerf:
    """Per-operator performance record built from one kernel_details.csv row.

    Parses tensor shape/dtype strings, computes data size and throughput, and
    assigns a performance color (subclasses grade AIV/AIC cores specifically).
    """

    def __init__(self, op_row: Dict):
        # Legacy csv headers start with "OP Type"; switch the title mapping.
        if "OP Type" in op_row.keys():
            Constant.update_title()
        self.row = op_row
        # Identification fields.
        self.model_name = op_row.get("Model Name")
        self.model_id = op_row.get("Model ID")
        self.task_id = op_row.get("Task ID")
        self.stream_id = op_row.get("Stream ID")
        self.infer_id = op_row.get("Infer ID")
        self.op_name = op_row.get("Name")
        self.op_type = op_row.get("Type")
        self.task_type = op_row.get("Accelerator Core")
        # Timing fields (us).
        self.task_start_time = op_row.get("Start Time(us)")
        self.task_duration = op_row.get("Duration(us)")
        self.task_wait_time = op_row.get("Wait Time(us)")
        self.block_dim = op_row.get("Block Dim")
        self.mix_block_dim = op_row.get("Mix Block Dim")

        # Tensor metadata.
        self.hf32_eligible = op_row.get("HF32 Eligible")
        self.input_shapes = op_row.get("Input Shapes")
        self.input_data_types = op_row.get("Input Data Types")
        self.input_formats = op_row.get("Input Formats")
        self.output_shapes = op_row.get("Output Shapes")
        self.output_data_types = op_row.get("Output Data Types")
        self.output_formats = op_row.get("Output Formats")
        self.context_id = op_row.get("Context ID")
        # AI Core (cube) metrics.
        self.aicore_time = op_row.get("aicore_time(us)")
        self.aic_total_cycles = op_row.get("aic_total_cycles")

        self.aic_mac_time = op_row.get("aic_mac_time(us)")
        self.aic_mac_ratio = op_row.get("aic_mac_ratio")
        self.aic_scalar_time = op_row.get("aic_scalar_time(us)")
        self.aic_scalar_ratio = op_row.get("aic_scalar_ratio")
        self.aic_mte1_time = op_row.get("aic_mte1_time(us)")
        self.aic_mte1_ratio = op_row.get("aic_mte1_ratio")
        self.aic_mte2_time = op_row.get("aic_mte2_time(us)")
        self.aic_mte2_ratio = op_row.get("aic_mte2_ratio")
        self.aic_fixpipe_time = op_row.get("aic_fixpipe_time(us)")
        self.aic_fixpipe_ratio = op_row.get("aic_fixpipe_ratio")
        self.aic_icache_miss_rate = op_row.get("aic_icache_miss_rate")
        # AI Vector core metrics.
        self.aiv_time = op_row.get("aiv_time(us)")
        self.aiv_total_cycles = op_row.get("aiv_total_cycles")
        self.aiv_vec_time = op_row.get("aiv_vec_time(us)")
        self.aiv_vec_ratio = op_row.get("aiv_vec_ratio")
        self.aiv_scalar_time = op_row.get("aiv_scalar_time(us)")
        self.aiv_scalar_ratio = op_row.get("aiv_scalar_ratio")
        self.aiv_mte2_time = op_row.get("aiv_mte2_time(us)")

        self.aiv_mte2_ratio = op_row.get("aiv_mte2_ratio")
        self.aiv_mte3_time = op_row.get("aiv_mte3_time(us)")
        self.aiv_mte3_ratio = op_row.get("aiv_mte3_ratio")
        self.aiv_icache_miss_rate = op_row.get("aiv_icache_miss_rate")
        self.cube_utilization = op_row.get("cube_utilization( %)")

    @staticmethod
    def get_dtype_size(dtype_str: str):
        """Byte size of one element of ``dtype_str`` (0 for unknown dtypes)."""
        return Constant.DTYPE_SIZE_MAP.get(dtype_str.lower(), 0)

    @staticmethod
    def get_element_count(shape: list):
        """Number of elements in ``shape``; an empty shape is a scalar (1).

        The explicit initial value also keeps reduce() from raising a
        TypeError on empty input and forces int conversion for 1-element
        shapes (reduce returns the lone element unreduced otherwise).
        """
        return functools.reduce(lambda x, y: int(x) * int(y), shape, 1)

    @staticmethod
    def shape_to_tuple(shape_str: str) -> tuple:
        """Parse '"2,3;4,5"' style shape strings into ((2, 3), (4, 5)).

        Returns an empty tuple for non-string or empty input (previously
        these paths returned a list, contradicting the annotation and
        making equality checks against () fail). Empty dims parse as 0.
        """
        if not isinstance(shape_str, str):
            return ()
        shape_str = shape_str.strip('"')
        split_shape = shape_str.strip(';')
        if not split_shape:
            return ()
        pairs = split_shape.split(';')
        shape_result = []
        for pair in pairs:
            pair = pair.strip(";")
            elements = pair.split(',')
            elements = tuple(int(element) if "" != element else 0 for element in elements)
            shape_result.append(elements)
        return tuple(shape_result)

    @staticmethod
    def dtype_to_tuple(dtypes_str: str) -> tuple:
        """Parse '"FLOAT16;INT32"' style dtype strings into a tuple of names.

        Returns an empty tuple (not a list) for non-string or empty input.
        """
        if not isinstance(dtypes_str, str):
            return ()
        dtypes_str = dtypes_str.strip('"')
        split_dtypes = dtypes_str.strip(';')
        if not split_dtypes:
            return ()
        pairs = split_dtypes.split(';')
        return tuple(pairs)

    def get_mac_ratio(self):
        """AIC MAC ratio for this op (may be None/0 when not collected)."""
        return self.aic_mac_ratio

    def get_size(self, shapes_str, dtypes_str):
        """Total bytes of the tensors described by the shape/dtype strings.

        Shapes missing a dtype counterpart are padded with scalar (1,) shapes;
        a shape count exceeding the dtype count is an error and yields 0.
        """
        shapes = self.shape_to_tuple(shapes_str)
        dtypes = self.dtype_to_tuple(dtypes_str)
        if len(shapes) > len(dtypes):
            print(f"[ERROR] The size of shape is greater than that of dtypes.")
            return 0
        if len(shapes) < len(dtypes):
            shapes = list(shapes)
            shapes.extend([(1,)] * (len(dtypes) - len(shapes)))
        all_size = 0
        for index, shape in enumerate(shapes):
            element_count = self.get_element_count(shape)
            dtype_size = self.get_dtype_size(dtypes[index])
            all_size += element_count * dtype_size
        return all_size

    def get_calc_size(self):
        """Input plus output tensor bytes, converted to MB."""
        if not self.input_shapes or not self.output_shapes:
            print("[ERROR] There is no tensor data, do not assess vector op performance.")
            return 0
        input_size = self.get_size(self.input_shapes, self.input_data_types)
        output_size = self.get_size(self.output_shapes, self.output_data_types)
        return (input_size + output_size) / (Constant.BYTE_UNIT_TRANS * Constant.BYTE_UNIT_TRANS)

    def get_throughput(self):
        """Throughput in GB/s; 0 when task_duration is missing or ~0."""
        if not self.task_duration or abs(self.task_duration) < 1e-6:
            print("[ERROR] There is no task_duration, do not assess vector op performance.")
            return 0
        return (self.row[Constant.TITLE.SIZE] / Constant.BYTE_UNIT_TRANS
                / self.task_duration * Constant.UNIT_TRANS * Constant.UNIT_TRANS)

    def get_perf_color(self):
        """Base class grades nothing: always WHITE (ungraded)."""
        return PerfColor.WHITE

    def update(self):
        """Enrich self.row with size, throughput, and color columns; return it."""
        self.row[Constant.TITLE.SIZE] = self.get_calc_size()
        self.row[Constant.TITLE.THROUGHPUT] = self.get_throughput()
        self.row[Constant.TITLE.COLOR] = self.get_perf_color().name
        return self.row
+
+
class VecOpPerf(OpPerf):
    """Grades vector (AIV) ops by throughput against Constant.TP_THRESHOLD."""

    def get_perf_color(self) -> PerfColor:
        """WHITE if no throughput; RED when slow and long; YELLOW when below
        threshold; GREEN otherwise."""
        tp = self.row[Constant.TITLE.THROUGHPUT]
        half_threshold = Constant.TP_THRESHOLD / 2
        if tp == 0:
            return PerfColor.WHITE
        if tp < half_threshold:
            # NOTE(review): a slow op with duration <= 20 falls through to
            # GREEN — presumably short ops are not worth flagging; confirm.
            return PerfColor.RED if self.task_duration > 20 else PerfColor.GREEN
        return PerfColor.YELLOW if tp < Constant.TP_THRESHOLD else PerfColor.GREEN
+
+
class CubeOpPerf(OpPerf):
    """Grades cube (AIC) ops by their MAC utilisation ratio."""

    def get_perf_color(self) -> PerfColor:
        """RED below 0.6, YELLOW in [0.6, 0.8), GREEN at or above 0.8;
        WHITE (with a warning) when the ratio is missing or zero."""
        ratio = self.get_mac_ratio()
        if not ratio:
            print("[WARNING] There is no aic_mac_ratio, do not assess cube op performance.")
            return PerfColor.WHITE
        if ratio < 0.6:
            return PerfColor.RED
        return PerfColor.YELLOW if ratio < 0.8 else PerfColor.GREEN
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/npu_fused_advice.py b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused_advice.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd5610bbbbb98d15fbab22bb646b2dd7de36ac3d
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused_advice.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from abc import ABC
+
+import pandas as pd
+
+from compute_advice.compute_advice_base import ComputeAdviceBase
+from compute_advice.npu_fused.csv_analyzer import CSVAnalyzer
+from compute_advice.npu_fused.json_analyzer import JSONAnalyzer
+
+
class NpuFusedAdvice(ComputeAdviceBase, ABC):
    """Advises replacing detected fusible op sequences with their fused NPU op."""

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.cur_data = dict()
        self.cur_bottleneck = str()
        self.cur_advice = str()
        self.kernel_details_path = ""
        self.call_stack = None

    def run(self):
        """Validate the path, analyse kernel details, and return formatted output."""
        if not self.path_check():
            return self.output_format_data
        self.process()
        self.output()
        return self.output_format_data

    def process(self):
        """Find fusible patterns, optionally attach call stacks, build advice text."""
        csv_analyzer = CSVAnalyzer(self.kernel_details_path)
        all_pattern_data = csv_analyzer.process()
        all_pattern_data = all_pattern_data.sort_values(by='duration sum(us)', ascending=False)
        # Idiomatic boolean indexing; the original abused DataFrame.get()
        # with a boolean Series as the "key".
        filter_data = all_pattern_data[all_pattern_data["duration sum(us)"] > 0]
        if not self.has_callstack():
            print("[Warning] No call stack info found, advice will be incomplete")
            self.cur_data = filter_data
        else:
            json_analyzer = JSONAnalyzer(self.trace_view_path)
            custom_code = json_analyzer.get_custom_code(filter_data, "first_timestamp", "custom code")
            self.cur_data = pd.concat([filter_data, custom_code], axis=1)
        op_num = len(self.cur_data.index)
        op_dur = filter_data["duration sum(us)"].sum()
        if op_num > 0:
            # The summed column is "duration sum(us)", so report us
            # (the message previously mislabelled the value as ms).
            self.cur_bottleneck = f"The computing time of fusable op is {round(op_dur, 2)} us."
            self.cur_advice = ""
            # enumerate() replaces the manual index counter.
            for index, (_, row) in enumerate(self.cur_data.iterrows()):
                advice = f"Advice {index}:\n"
                cur_op = "[" + ", ".join(row.loc["pattern"]) + "]"
                npu_fused_op = row.loc["pattern_name"]
                advice += f"Replace {cur_op} with {npu_fused_op}. "
                if self.call_stack:
                    advice += f"This pattern first happened in: \n{row['custom code']}"
                if index != op_num - 1:
                    advice += "\n"
                self.cur_advice += advice
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/npu_slow_advice.py b/profiler/advisor_review/advisor_backend/compute_advice/npu_slow_advice.py
new file mode 100644
index 0000000000000000000000000000000000000000..caff1c792c2171c33a4dd876b0741d6c215c5766
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/compute_advice/npu_slow_advice.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from abc import ABC
+import multiprocessing
+
+import pandas as pd
+
+from compute_advice.compute_advice_base import ComputeAdviceBase
+from compute_advice.npu_fused.op_perf import OpPerfFactory
+from common_func_advisor.constant import Constant
+from common_func_advisor.constant import PerfColor
+from advisor_backend.common_func_advisor.trace_view_json import TraceViewJson
+
+
class NpuSlowAdvice(ComputeAdviceBase, ABC):
    """Grades every op in kernel_details.csv and exports a color-coded sheet."""
    OP_PERF_SHEET = "op_perf"

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.kernel_details_path = ""
        self.data = pd.DataFrame()

    @staticmethod
    def save_to_excel(data: pd.DataFrame, file_path: str) -> None:
        """Write ``data`` to ``file_path`` with rows tinted by perf color."""
        writer = pd.ExcelWriter(file_path, engine="xlsxwriter", mode="w")
        # try/finally guarantees the writer is closed (and the file handle
        # released) even when formatting raises mid-way.
        try:
            data.index.name = Constant.TITLE.INDEX
            data.to_excel(writer, index=True, sheet_name=NpuSlowAdvice.OP_PERF_SHEET)
            NpuSlowAdvice.color_sheet(data, writer.book, writer.sheets[NpuSlowAdvice.OP_PERF_SHEET])
            writer.sheets[NpuSlowAdvice.OP_PERF_SHEET].freeze_panes = "A2"
        finally:
            writer.close()

    @staticmethod
    def color_sheet(data: pd.DataFrame, workbook, worksheet):
        """Apply a background fill to each row according to its COLOR column."""
        color_rgb = {
            PerfColor.GREEN.name: workbook.add_format({'bg_color': '#C6EFCE'}),
            PerfColor.YELLOW.name: workbook.add_format({'bg_color': '#FFEB9C'}),
            PerfColor.RED.name: workbook.add_format({'bg_color': '#FFC7CE'}),
        }
        for row_index, row_data in data.iterrows():
            fill_format = color_rgb.get(row_data[Constant.TITLE.COLOR])
            if not fill_format:
                continue
            # +1 skips the header row written by to_excel.
            worksheet.set_row(row_index + 1, None, fill_format)

    @staticmethod
    def update_op_row(row: tuple):
        """Pool worker: enrich one (index, row) pair from DataFrame.iterrows()."""
        return OpPerfFactory.build(row[1]).update()

    def get_call_stack(self, data: pd.DataFrame, index_id: int, ts_col: str) -> str:
        """Return the call stack for one row, or "" when stacks weren't collected."""
        if not self.has_callstack():
            print("There is no call stack info, please set 'with_stack=True'")
            return ""
        trace_json = TraceViewJson(self.trace_view_path)
        return trace_json.get_call_stack(data, index_id, ts_col)

    def run(self):
        """Validate the path, process kernel details, return the enriched frame."""
        if not self.path_check():
            return self.data
        self.process()
        return self.data

    def process(self):
        """Load kernel details and compute per-op perf metrics in parallel."""
        self.data = pd.read_csv(self.kernel_details_path, dtype={"Start Time(us)": str})
        # Strip the trailing "\t" separator; rstrip keeps the value intact
        # when the separator is absent (slicing [:-1] would drop a digit).
        self.data["Start Time(us)"] = self.data["Start Time(us)"].str.rstrip("\t")
        # Context manager guarantees the worker pool is torn down on error.
        with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
            result = pool.map(self.update_op_row, self.data.iterrows())
        self.data = pd.DataFrame(result)
diff --git a/profiler/advisor_review/advisor_backend/interface.py b/profiler/advisor_review/advisor_backend/interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e20c26d4d7bb000b20c28439b28ddf4811f057f
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/interface.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+
+sys.path.append(
+ os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "advisor_backend"))
+sys.path.append(
+ os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "compare_tools"))
+sys.path.append(
+ os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "cluster_analyse"))
+from common_func_advisor.constant import Constant
+from advisor_backend.advice_factory.cluster_advice_factory import ClusterAdviceFactory
+from advisor_backend.advice_factory.compute_advice_factory import ComputeAdviceFactory
+from advisor_backend.advice_factory.timeline_advice_factory import TimelineAdviceFactory
+from advisor_backend.advice_factory.overall_advice_factory import OverallAdviceFactory
+
+
class Interface:
    """Facade that dispatches an advice request to the matching factory."""

    def __init__(self, collection_path: str):
        self.collection_path = os.path.realpath(collection_path)
        self._factory_controller = FactoryController(collection_path)

    def get_data(self, mode: str, advice: str, **kwargs):
        """Produce advice for ``mode``/``advice``.

        kwargs may carry ``input_path`` to override the collection path.
        Raises RuntimeError when mode/advice exceed the allowed lengths.
        (The spurious ``self: any`` annotation was removed — ``any`` is the
        builtin function, not a type.)
        """
        if len(mode) > Constant.MAX_INPUT_MODE_LEN or len(advice) > Constant.MAX_INPUT_ADVICE_LEN:
            msg = '[ERROR]Input Mode is illegal.'
            raise RuntimeError(msg)
        factory = self._factory_controller.create_advice_factory(mode, kwargs.get("input_path", ""))
        return factory.produce_advice(advice, kwargs)
+
+
class FactoryController:
    """Maps an advice mode string to the factory class that handles it."""
    FACTORY_LIB = {
        Constant.CLUSTER: ClusterAdviceFactory,
        Constant.COMPUTE: ComputeAdviceFactory,
        Constant.TIMELINE: TimelineAdviceFactory,
        Constant.OVERALL: OverallAdviceFactory
    }

    def __init__(self, collection_path: str):
        self.collection_path = os.path.realpath(collection_path)
        # NOTE(review): temp_input_path is never read in this file — confirm
        # whether it is used by callers before removing.
        self.temp_input_path = None

    def create_advice_factory(self, mode: str, input_path: str):
        """Instantiate the factory for ``mode``; ``input_path`` (when truthy)
        overrides the default collection path.

        Raises RuntimeError for an unknown mode instead of the opaque
        "'NoneType' object is not callable" TypeError the dict-get produced.
        """
        collection_path = input_path if input_path else self.collection_path
        factory_cls = self.FACTORY_LIB.get(mode)
        if factory_cls is None:
            raise RuntimeError('[ERROR]Input Mode is illegal.')
        return factory_cls(collection_path)
+
+
if __name__ == "__main__":
    # Interface requires a collection path; the previous `Interface()` call
    # always raised TypeError. Take it from the CLI, defaulting to the cwd.
    Interface(sys.argv[1] if len(sys.argv) > 1 else os.getcwd())
diff --git a/profiler/advisor_review/advisor_backend/overall_advice/__init__.py b/profiler/advisor_review/advisor_backend/overall_advice/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/advisor_backend/overall_advice/overall_summary_advice.py b/profiler/advisor_review/advisor_backend/overall_advice/overall_summary_advice.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5bfc351f2820ac8d797798fd959577da8062ea4
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/overall_advice/overall_summary_advice.py
@@ -0,0 +1,176 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+from advisor_backend.advice_base import AdviceBase
+from compare_backend.utils.constant import Constant
+from compare_interface.comparison_interface import ComparisonInterface
+
+
class OverallSummaryAdvice(AdviceBase):
    """Overall performance summary advice.

    Runs the overall comparison (target vs. optional baseline profiling data),
    splits E2E time into computing / communication / free categories, and
    derives bottleneck descriptions plus coarse-grained advice.
    """
    # Follow-up advice text keyed by top-level time category.
    advice_map = {
        "Computing Time": "if you want more detailed advice please use msprof-analyze advisor computation.",
        "Uncovered Communication Time": "if you want more detailed advice, please use msprof-analyze advisor schedule.",
        "Free Time": "if you want more detailed advice please use msprof-analyze advisor schedule."
    }
    # Raw comparison column name -> display name.
    time_name_map = {
        "Computing Time": "computing",
        "Uncovered Communication Time": "communication",
        "Free Time": "free",
        'Cube Time(Num)': 'Cube Time',
        'Vector Time(Num)': 'Vector Time',
        'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)',
        'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)',
        'Other Time': "Other Computing Time",
        'SDMA Time(Num)': 'SDMA Time'
    }
    # Sub-category columns per top-level category; the "(...)" suffix of a key
    # is stripped before it is used as a display category name.
    performance_time_dict = {
        "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)',
                           'Flash Attention Time(Backward)(Num)', 'Other Time'],
        "Uncovered Communication Time(Wait Time)": [],
        "Free Time": ['SDMA Time(Num)']
    }

    def __init__(self, collection_path: str, kwargs: dict):
        super().__init__(collection_path)
        self.base_collection_path = kwargs.get("base_collection_path", "")
        self._has_base_collection = False
        self._is_minimal_profiling = False
        self.cur_data = {}
        self.cur_bottleneck = {}
        self.cur_advices = ""
        self._headers = []
        self._base_data = []
        self._comparison_data = []

    @staticmethod
    def split_duration_and_num(time_value: str) -> tuple:
        """Split a "0.229s(1756)" style value into (duration, num).

        num stays None when absent/unparsable; an unparsable duration is
        reported with a warning and falls back to 0.0.
        """
        split_data = time_value.split("s")  # time value example: 0.229s(1756)
        duration, num = 0.0, None
        if len(split_data) >= 2:
            try:
                num = int(split_data[1].strip("()"))
            except ValueError:
                pass
        if len(split_data) >= 1:
            try:
                duration = float(split_data[0])
            except ValueError:
                print(f"[WARNING] Invalid time value: {time_value}.")
        return duration, num

    @staticmethod
    def calculate_ratio(dividend, divisor):
        """Return dividend / divisor, or inf for a zero/empty divisor."""
        if not divisor:
            return float("inf")
        return dividend / divisor

    def run(self):
        """Entry point: compare, publish output and detect bottlenecks."""
        if self.path_check():
            self.process()
        self.output()
        self.identify_bottleneck()
        return self.output_format_data

    def path_check(self):
        """Check the target path exists; also record whether a valid baseline was given."""
        if self.base_collection_path:
            if os.path.exists(self.base_collection_path):
                self._has_base_collection = True
            else:
                print(f"[WARNING] Invalid path which not exists: {self.base_collection_path}.")
        return os.path.exists(self.collection_path)

    def process(self):
        """Run the overall comparison and aggregate per-category durations."""
        base_collection_path = self.base_collection_path if self._has_base_collection else self.collection_path
        result_data = ComparisonInterface(base_collection_path, self.collection_path).compare(Constant.OVERALL_COMPARE)
        for data in result_data.values():
            self._headers = data.get("headers", [])
            rows = data.get("rows", [])
            # Two rows means baseline + comparison target.
            if len(rows) == 2:
                self._base_data = rows[0]
                self._comparison_data = rows[1]
        if not self._headers or not self._comparison_data:
            return
        self._is_minimal_profiling = 'E2E Time(Not minimal profiling)' not in self._headers
        if self._has_base_collection:
            self.cur_data["comparison_result"] = result_data
        time_category_dict = {}
        for time_category, time_list in self.performance_time_dict.items():
            time_value = self.get_time_value(time_category, self._comparison_data)
            if time_value == Constant.INVALID_VALUE:
                continue
            duration, _ = self.split_duration_and_num(time_value)
            time_category = time_category.split("(")[0]
            time_category_dict[time_category] = duration
            self.get_sub_category_time(time_category, time_list, duration)
        self.cur_data["overall_data"] = time_category_dict

    def get_time_value(self, header_name: str, data_list: list):
        """Return the cell of *data_list* under column *header_name*, or INVALID_VALUE."""
        try:
            data_index = self._headers.index(header_name)
        except ValueError:
            return Constant.INVALID_VALUE
        try:
            time_value = data_list[data_index]
        except IndexError:
            return Constant.INVALID_VALUE
        return time_value

    def get_sub_category_time(self, category: str, time_list: list, total_duration: float):
        """Collect duration / ratio / kernel-number details per sub category."""
        sub_time_dict = {}
        for time_name in time_list:
            time_value = self.get_time_value(time_name, self._comparison_data)
            if time_value == Constant.INVALID_VALUE:
                continue
            sub_time_dict.setdefault(f"{category} Subtype", []).append(self.time_name_map.get(time_name, ""))
            duration, num = self.split_duration_and_num(time_value)
            sub_time_dict.setdefault("Duration(s)", []).append(duration)
            sub_time_dict.setdefault("Duration Ratio", []).append(
                "{:.2%}".format(self.calculate_ratio(duration, total_duration)))
            sub_time_dict.setdefault("Kernel Number", []).append(num)
        self.cur_data[self.time_name_map.get(category)] = sub_time_dict

    def identify_bottleneck(self):
        """Build per-category, overall and (optionally) comparison bottleneck texts."""
        overall_data = self.cur_data.get("overall_data")
        if not overall_data:
            return
        # Fix: keep e2e_time numeric — it is used as a divisor below. The old
        # code pre-formatted it with '%.3f' into a *string*, which made
        # calculate_ratio raise TypeError (float / str) whenever the
        # minimal-profiling Free-Time branch was reached.
        e2e_time = sum(overall_data.values())
        overall_bottleneck = f"The Model E2E Time is {e2e_time:.3f}s.\n"
        comparison_bottleneck = ""
        for time_type, time_value in overall_data.items():
            # add subtype time bottleneck
            advice = self.advice_map.get(time_type, "")
            self.cur_bottleneck[self.time_name_map.get(time_type)] = f"{time_type} is {time_value}s.\n{advice}"
            # add overall bottleneck
            overall_bottleneck += f" -- {time_type} is {time_value}s\n"
            if time_type == "Free Time" and self._is_minimal_profiling and self.calculate_ratio(time_value,
                                                                                                e2e_time) > 0.1:
                overall_bottleneck += "percentage of free time exceed the threshold 10%."
            if not self._has_base_collection:
                continue
            # add comparison bottleneck
            time_type_origin = "Uncovered Communication Time(Wait Time)" \
                if time_type == "Uncovered Communication Time" else time_type
            base_duration, _ = self.split_duration_and_num(self.get_time_value(time_type_origin, self._base_data))
            if time_value > base_duration:
                ratio = "{:.2%}".format(self.calculate_ratio(time_value - base_duration, base_duration))
                comparison_bottleneck += f"{time_type} exceeds the benchmark by {ratio}\n"
        self.cur_bottleneck["overall_data"] = overall_bottleneck
        self.cur_bottleneck["comparison_result"] = comparison_bottleneck

    def output(self):
        """Publish data / bottleneck / advice through the base-class output format."""
        self.output_format_data[self.DATA] = self.cur_data
        self.output_format_data[self.BOTTLENECK] = self.cur_bottleneck
        self.output_format_data[self.ADVICE] = self.cur_advices
diff --git a/profiler/advisor_review/advisor_backend/prof_bean_advisor/__init__.py b/profiler/advisor_review/advisor_backend/prof_bean_advisor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/prof_bean_advisor/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/advisor_review/advisor_backend/prof_bean_advisor/cluster_step_trace_time_bean.py b/profiler/advisor_review/advisor_backend/prof_bean_advisor/cluster_step_trace_time_bean.py
new file mode 100644
index 0000000000000000000000000000000000000000..b108fc77a3f3408d48c79ce6b542f98427d88b0b
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/prof_bean_advisor/cluster_step_trace_time_bean.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
class ClusterStepTraceTimeBean:
    """Typed accessor over one row of cluster_step_trace_time.csv."""

    STEP = "Step"
    TYPE = "Type"
    INDEX = "Index"
    COMPUTING = "Computing"
    COMMUNICATION = "Communication(Not Overlapped)"
    FREE = "Free"

    def __init__(self, data: dict):
        self._data = data

    @property
    def step(self) -> str:
        return self._data.get(self.STEP, '')

    @property
    def type(self) -> str:
        return self._data.get(self.TYPE, '')

    @property
    def index(self) -> int:
        """Rank/device index of the row.

        :raises ValueError: if the column is missing or not an integer.
        """
        try:
            # Fix: a missing key makes get() return None and int(None) raises
            # TypeError, which escaped the old ValueError-only handler; catch
            # both so callers always see the documented ValueError.
            return int(self._data.get(self.INDEX))
        except (ValueError, TypeError) as e:
            msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Index'."
            raise ValueError(msg) from e

    @property
    def compute(self) -> float:
        """Computing time; raises ValueError for a missing/invalid cell."""
        try:
            return float(self._data.get(self.COMPUTING, ''))
        except (ValueError, TypeError) as e:
            msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Computing'."
            raise ValueError(msg) from e

    @property
    def communication(self) -> float:
        """Non-overlapped communication time; raises ValueError when invalid."""
        try:
            return float(self._data.get(self.COMMUNICATION, ''))
        except (ValueError, TypeError) as e:
            msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Communication'."
            raise ValueError(msg) from e

    @property
    def free(self) -> float:
        """Free time; raises ValueError for a missing/invalid cell."""
        try:
            return float(self._data.get(self.FREE, ''))
        except (ValueError, TypeError) as e:
            msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Free'."
            raise ValueError(msg) from e
diff --git a/profiler/advisor_review/advisor_backend/timeline_advice/__init__.py b/profiler/advisor_review/advisor_backend/timeline_advice/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/timeline_advice/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/advisor_review/advisor_backend/timeline_advice/op_schedule_advice.py b/profiler/advisor_review/advisor_backend/timeline_advice/op_schedule_advice.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e492b2156c6faee6c023206f3cfc4f852eeb547
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/timeline_advice/op_schedule_advice.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from decimal import Decimal
+from common_func_advisor.constant import Constant
+from timeline_advice.timeline_advice_base import TimelineAdviceBase
+
+
class OpScheduleAdvice(TimelineAdviceBase):
    """Detect operator-scheduling bottlenecks from Computing/Free overlap events."""

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.cur_data = list()
        self.cur_bottleneck = str()
        self.cur_advice = str()

    def run(self):
        """Pre-parse the timeline, analyse scheduling, and return formatted output."""
        if not self.path_check():
            return self.output_format_data
        self.preparse()
        self.process()
        self.output()
        return self.output_format_data

    def process(self):
        """Pair each Computing slot with the Free gap before it and derive advice."""
        cpt_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_CPT]
        free_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_FREE]
        if not cpt_data or not free_data:
            print("[ERROR] Fail to find Overlap data.")
            return

        op_dur = [entry.get("dur", 0) for entry in cpt_data]
        op_free = [0.0] * len(cpt_data)
        merge_data = list()
        merge_data.extend(cpt_data)
        merge_data.extend(free_data)
        # Decimal preserves full timestamp precision while ordering events.
        merge_data.sort(key=lambda x: Decimal(x.get("ts")))
        idx = free_idx = 0
        while idx < len(merge_data) and free_idx < len(op_free):
            entry = merge_data[idx]
            entry_name = entry.get("name")
            if entry_name == 'Free':
                op_free[free_idx] = merge_data[idx].get('dur')
            elif entry_name == 'Computing':
                free_idx += 1
            idx += 1
        self.cur_data.append(op_dur)
        self.cur_data.append(op_free)
        free_ratio, cpt_ratio, _ = self.get_ratio()
        if free_ratio < 0.2:
            # Free time below 20% of the step: scheduling is not a bottleneck.
            return
        # Fix: the original message printed free_ratio as "NPU Utilication"
        # (typo) and cpt_ratio as free utilization — labels were swapped.
        self.cur_bottleneck = f"NPU Utilization: {round(cpt_ratio * 100, 2)}%, " \
                              f"NPU Free Utilization: {round(free_ratio * 100, 2)}%."
        if len(self.preparse_data[self.PREPARSE_TYPE.SYNCHRONIZE]) > 1:
            self.cur_advice = f"Device synchronize {len(self.preparse_data[self.PREPARSE_TYPE.SYNCHRONIZE])} times, " \
                              "try to reduce synchronization statements to alleviate the bottleneck of operator delivery.\n"
        small_op_num = self.small_op_block(op_free, op_dur)
        small_op_ratio = small_op_num / len(op_dur) if op_dur else 0.0
        if small_op_ratio > Constant.SMALL_OP_NUM_RATIO:
            self.cur_advice += "There are too many small operators, you can increase the batch size appropriately."

    def small_op_block(self, op_frees, op_durs):
        """Count operators whose preceding free gap dominates their own duration."""
        small_op_num = 0
        for op_free, op_dur in zip(op_frees, op_durs):
            if op_free > op_dur * Constant.SMALL_OP_DUR_RATIO:
                small_op_num += 1
        return small_op_num

    def get_ratio(self):
        """Return (free, computing, communication) shares of the total step time."""
        cpt_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_CPT]
        free_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_FREE]
        cmu_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_CMU]
        cpt_time = sum(x.get("dur", 0) for x in cpt_data)
        free_time = sum(x.get("dur", 0) for x in free_data)
        cmu_time = sum(x.get("dur", 0) for x in cmu_data)
        total_time = cpt_time + free_time + cmu_time
        if total_time > 0.0:
            return (free_time / total_time, cpt_time / total_time, cmu_time / total_time)
        return (0.0, 0.0, 0.0)
diff --git a/profiler/advisor_review/advisor_backend/timeline_advice/optimizer_advice.py b/profiler/advisor_review/advisor_backend/timeline_advice/optimizer_advice.py
new file mode 100644
index 0000000000000000000000000000000000000000..dee2e7ba563d0d00b4459333dffb4099dee9240a
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/timeline_advice/optimizer_advice.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from timeline_advice.timeline_advice_base import TimelineAdviceBase
+
+
class OptimizerAdvice(TimelineAdviceBase):
    """Advise replacing stock torch optimizers with fused NPU equivalents."""

    # Timeline optimizer-step event name -> fused NPU optimizer to suggest.
    OPTIMIZER_MAP = {
        "Optimizer.step#SGD.step": "torch_npu.optim.NpuFusedSGD",
        "Optimizer.step#Adadelta.step": "torch_npu.optim.NpuFusedAdadelta",
        "Optimizer.step#Lamb.step": "torch_npu.optim.NpuFusedLamb",
        "Optimizer.step#Adam.step": "torch_npu.optim.NpuFusedAdam",
        "Optimizer.step#AdamW.step": "torch_npu.optim.NpuFusedAdamW",
        "Optimizer.step#AdamP.step": "torch_npu.optim.NpuFusedAdamP",
        "Optimizer.step#BertAdam.step": "torch_npu.optim.NpuFusedBertAdam",
        "Optimizer.step#RMSprop.step": "torch_npu.optim.NpuFusedRMSprop",
        "Optimizer.step#RMSpropTF.step": "torch_npu.optim.NpuFusedRMSpropTF",
    }

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.cur_data = list()
        self.cur_bottleneck = str()
        self.cur_advice = str()

    def run(self):
        """Pre-parse the timeline, derive optimizer advice, and return formatted output."""
        if not self.path_check():
            return self.output_format_data
        self.preparse()
        self.process()
        self.output()
        return self.output_format_data

    def process(self):
        """Build advice for every distinct optimizer seen in the timeline."""
        if not self.preparse_data[self.PREPARSE_TYPE.OPTIMIZER]:
            return

        # Sort the deduplicated names so the advice order is deterministic
        # (set iteration order varied between runs).
        self.cur_data = sorted(set(entry.get("name", "") for entry in self.preparse_data[self.PREPARSE_TYPE.OPTIMIZER]))
        advice_lines = []
        for opt_name in self.cur_data:
            fused_optimizer = self.OPTIMIZER_MAP.get(opt_name)
            if fused_optimizer is None:
                # Fix: optimizers without a fused counterpart previously
                # produced "You can choose None to replace ..." advice.
                continue
            advice_lines.append(f"You can choose {fused_optimizer} to replace the current Optimizer: {opt_name}.")
        self.cur_advice = "\n".join(advice_lines)
        self.cur_bottleneck = self.cur_advice
diff --git a/profiler/advisor_review/advisor_backend/timeline_advice/timeline_advice_base.py b/profiler/advisor_review/advisor_backend/timeline_advice/timeline_advice_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c7ac96cd22673741accd6bb2abb463566a2e652
--- /dev/null
+++ b/profiler/advisor_review/advisor_backend/timeline_advice/timeline_advice_base.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import abstractmethod
+from collections import defaultdict
+import json
+import os
+
+from advice_base import AdviceBase
+from common_func.file_manager import FileManager
+
+
class TimelineAdviceBase(AdviceBase):
    """Base class for advice generated from a trace_view.json timeline.

    preparse() buckets timeline events into PREPARSE_TYPE categories that
    subclasses consume from ``preparse_data`` in their process() logic.
    """

    class PREPARSE_TYPE:
        # Bucket ids used as keys of preparse_data.
        OPTIMIZER = 0
        STEP = 1
        OVERLAP_CPT = 2
        OVERLAP_FREE = 3
        OVERLAP_CMU = 4
        ENQUEUE = 5
        DEQUEUE = 6
        HOST_TO_DEVICE = 7
        SYNCHRONIZE = 8

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.trace_view_path = ""
        # Guard so preparse() scans the (potentially large) trace only once.
        self.has_preparse = False
        self.preparse_data = defaultdict(list)
        # Exact event names mapped straight to a bucket; prefix-matched names
        # (Optimizer.step#..., ProfilerStep#...) are handled in preparse().
        self.entry_map = {
            'Computing': self.PREPARSE_TYPE.OVERLAP_CPT,
            'Free': self.PREPARSE_TYPE.OVERLAP_FREE,
            'AscendCL@aclrtSynchronizeDevice': self.PREPARSE_TYPE.SYNCHRONIZE
        }

    def path_check(self):
        """
        check whether input path is valid

        Accepts either an *ascend_pt result directory (trace_view.json is
        looked up under ASCEND_PROFILER_OUTPUT) or a trace_view.json file
        directly; sets self.trace_view_path on success.
        """
        if not os.path.exists(self.collection_path):
            print("[ERROR] Path: {} is not exist.".format(self.collection_path))
            return False
        if os.path.isdir(self.collection_path) and self.collection_path.endswith("ascend_pt"):
            self.trace_view_path = os.path.join(self.collection_path, "ASCEND_PROFILER_OUTPUT", "trace_view.json")
            if not os.path.exists(self.trace_view_path):
                print("[ERROR] trace_view.json is not exist in the Path: {}.".format(os.path.join(self.collection_path, "ASCEND_PROFILER_OUTPUT")))
                return False
        elif os.path.isfile(self.collection_path) and os.path.basename(self.collection_path) == "trace_view.json":
            self.trace_view_path = self.collection_path
        else:
            print("[ERROR] Please input ascend_pt or trace_view.json.")
            return False
        print("[INFO] Start to analyse the target file: {}".format(self.trace_view_path))
        return True

    @abstractmethod
    def run(self):
        """
        analyze profiling data and advice
        """

    @abstractmethod
    def output(self):
        """
        output relevant data
        """
        # NOTE(review): marked @abstractmethod yet carries a default body, and
        # subclasses in this package call self.output() without overriding it.
        # Whether instantiation is even allowed depends on AdviceBase's
        # metaclass (ABCMeta would forbid it) — confirm against advice_base.
        self.output_format_data[self.DATA] = self.cur_data
        self.output_format_data[self.BOTTLENECK] = self.cur_bottleneck
        self.output_format_data[self.ADVICE] = self.cur_advice

    def preparse(self):
        # Single pass over the trace; events are routed by name into buckets.
        if self.has_preparse:
            return
        json_reader = FileManager.read_json_file(self.trace_view_path)
        if not isinstance(json_reader, list):
            return
        for entry in json_reader:
            name = entry.get("name", None)
            if not name:
                continue
            if name.startswith("Optimizer.step#") and name.endswith(".step"):
                self.preparse_data[self.PREPARSE_TYPE.OPTIMIZER].append(entry)
            elif name.startswith("ProfilerStep#"):
                self.preparse_data[self.PREPARSE_TYPE.STEP].append(entry)
            elif name in self.entry_map:
                self.preparse_data[self.entry_map[name]].append(entry)
        self.has_preparse = True
diff --git a/profiler/advisor_review/analyzer/__init__.py b/profiler/advisor_review/analyzer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/base_analyzer.py b/profiler/advisor_review/analyzer/base_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f4bd3202cd2071088f25564a7d4b14144a34826
--- /dev/null
+++ b/profiler/advisor_review/analyzer/base_analyzer.py
@@ -0,0 +1,94 @@
+import logging
+from functools import wraps
+from typing import Dict, List, Union
+from abc import abstractmethod, ABCMeta
+
+from profiler.advisor.common import constant
+from profiler.advisor.common.version_control import VersionControl
+from profiler.advisor.dataset.dataset import Dataset
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.display.html.render import HTMLRender
+
+logger = logging.getLogger()
+
+
class BaseAnalyzer(VersionControl, metaclass=ABCMeta):
    """Abstract analyzer base: loads datasets, records results, renders HTML."""

    _SUPPORT_VERSIONS = constant.SUPPORTED_CANN_VERSION

    # Dataset classes the concrete analyzer depends on; overridden by subclasses.
    dataset_cls_list = []

    def __init__(self, collection_path, n_processes: int = 1, **kwargs):
        self.n_processes = n_processes
        self.cann_version = kwargs.get("cann_version", constant.DEFAULT_CANN_VERSION)
        self.torch_version = kwargs.get("torch_version", constant.DEFAULT_TORCH_VERSION)
        self.html_render = HTMLRender()
        self.collection_path = collection_path
        self.kwargs = kwargs
        self.dataset_list: Dict[str, List[Dataset]] = {}
        self.init_dataset_list()
        self.result = OptimizeResult()
        self.record_list: Dict[str, List] = {}

    @classmethod
    def check_data(cls, data_list: tuple):
        """
        check if all data in data list is contained
        :param data_list: data list to check
        :return: func ptr if check success
        """

        def decorate(func):

            @wraps(func)
            def wrapper(self, **kwargs):
                data = self.dataset_list
                if data is None:
                    return None
                for data_key in data_list:
                    if data_key not in data:
                        return None

                logger.info("Enable analysis %s with %s", self.__class__.__name__, ",".join(data_list))
                # Fix: forward **kwargs — the original called func(self) and
                # silently dropped every keyword argument the caller passed.
                return func(self, **kwargs)

            return wrapper

        return decorate

    @abstractmethod
    def optimize(self, **kwargs):
        pass

    @abstractmethod
    def make_record(self):
        pass

    @abstractmethod
    def make_render(self):
        pass

    def init_dataset_list(self) -> None:
        """Instantiate each dataset class and register it under its key."""
        dataset_cls_list = self.dataset_cls_list
        if len(dataset_cls_list) == 0:
            # Lazy %-style args only; the original mixed an f-string prefix
            # (with no placeholders) with %-style logging arguments.
            logger.warning("Analyser: %s don't rely on any dataset!", self.__class__.__name__)
            return

        for dataset_cls in dataset_cls_list:
            if dataset_cls and callable(dataset_cls):
                dataset = dataset_cls(collection_path=self.collection_path, data=self.dataset_list, **self.kwargs)
                key = dataset_cls.get_key()
                if key not in self.dataset_list:
                    self.dataset_list[key] = []
                self.dataset_list[key].append(dataset)

    @staticmethod
    def get_first_data_by_key(data, key) -> Union[Dataset, None]:
        """
        get the first member from data with key
        :param data: input data
        :param key: data key
        :return: the first dataset in dataset list
        """
        if key in data and len(data[key]) > 0:
            return data[key][0]
        return None
diff --git a/profiler/advisor_review/analyzer/cluster/__init__.py b/profiler/advisor_review/analyzer/cluster/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/cluster/slow_link_analyser.py b/profiler/advisor_review/analyzer/cluster/slow_link_analyser.py
new file mode 100644
index 0000000000000000000000000000000000000000..846b79a50f31abb8445a0e5c2e82aaaf3c8ee23d
--- /dev/null
+++ b/profiler/advisor_review/analyzer/cluster/slow_link_analyser.py
@@ -0,0 +1,122 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import Dict, List
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.common import constant
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataSet
+
+
class SlowLinkAnalyzer(BaseAnalyzer):
    """Analyse cluster communication bandwidth and flag slow RDMA/SDMA links."""

    RDMA_TIME_MS = "RDMA time(ms)"
    RDMA_SIZE_MB = "RDMA size(mb)"
    SDMA_TIME_MS = "SDMA time(ms)"
    SDMA_SIZE_MB = "SDMA size(mb)"
    RDMA_BANDWIDTH = "RDMA bandwidth(GB/s)"
    SDMA_BANDWIDTH = "SDMA bandwidth(GB/s)"
    COMMUNICATION_BANDWIDTH_INFO = "Communication Bandwidth Info"
    TRANSIT_TIME = "Transit Time(ms)"
    TRANSIT_SIZE = "Transit Size(MB)"
    SDMA = "SDMA"
    RDMA = "RDMA"
    SLOW_LINK_ANALYSIS = "slow_link_analysis"
    dataset_cls_list = [ClusterCommunicationDataSet]

    def __init__(self, collection_path, n_processes: int = 1, **kwargs):
        super().__init__(collection_path, n_processes, **kwargs)
        key = ClusterCommunicationDataSet.get_key()
        self.communication_data_class = self.get_first_data_by_key(self.dataset_list, key)
        self.rank_bw_dict = self.communication_data_class.get_data()
        self.result = OptimizeResult()
        # NOTE(review): "bottelneck" is misspelled but kept — sibling analyzers
        # use the same attribute name.
        self.bottelneck = ''
        self.suggestion = ''
        # Fix: initialize with the same shape format_details() returns; the
        # old empty *list* made make_record() crash with a TypeError if it ran
        # before optimize().
        self.format_datas = {"headers": [], "data": []}

    def optimize(self, **kwargs):
        """Run the slow-link analysis and return the populated OptimizeResult."""
        if self.rank_bw_dict is None:
            # Message (Chinese): analysis failed because data loading failed;
            # check your cluster_analysis_output folder; ignore if you do not
            # care about this data. (Fixed "outpu" typo in the folder name.)
            print("slow_link 分析失败,原因是数据加载失败,请检查你的cluster_analysis_output文件夹, \
                  如不关心这类数据请忽略")
            return self.result
        self.process()
        self.format_datas = self.format_details()
        self.make_record()
        self.make_render()
        return self.result

    def process(self):
        """Produce a bottleneck description for each bandwidth type."""
        if self.rank_bw_dict:
            self.produce_bottleneck(self.RDMA_BANDWIDTH)
            self.produce_bottleneck(self.SDMA_BANDWIDTH)

    def produce_bottleneck(self, link_type: str):
        """Append avg/max/min statistics of *link_type* to the bottleneck text."""
        data_list = [rank_dict.get(link_type, 0) for rank_id, rank_dict in self.rank_bw_dict.items()]
        if not data_list:
            # Defensive: avoid ZeroDivisionError on an empty rank dict.
            return
        avg_bw = round(sum(data_list) / len(data_list), 3)
        if avg_bw == 0:
            return
        self.bottelneck += f'{link_type}: \n' \
                           f'    The average is {avg_bw}, \n' \
                           f'    while the maximum  is {round(max(data_list), 3)}GB/s \n' \
                           f'    and the minimum is {round(min(data_list), 3)}GB/s. \n' \
                           f'    the difference is {round(max(data_list) - min(data_list), 3)}GB/s. \n'

    def format_details(self):
        """Build a {"headers": [...], "data": [...]} table of per-rank bandwidths."""
        if not self.rank_bw_dict:
            return {
                "headers": [],
                "data": []
            }

        details_dict = {}
        # Union of all metric names across ranks, sorted for a stable column order.
        headers = sorted({k for rank_bw_value in self.rank_bw_dict.values() for k in rank_bw_value.keys()})
        data_list = [[rank_id] + [rank_bw.get(k, 0) for k in headers] for rank_id, rank_bw in self.rank_bw_dict.items()]
        data_list.sort(key=lambda x: x[0])  # sort rows by rank_id

        details_dict["headers"] = ["rank_id"] + headers
        details_dict["data"] = data_list

        return details_dict

    def make_record(self):
        """
        make record for what and how to optimize
        """
        optimization_item = OptimizeItem(
            SlowLinkAnalyzer.SLOW_LINK_ANALYSIS,
            self.bottelneck,
            self.suggestion
        )
        self.result.add(OptimizeRecord(optimization_item))

        # The enumerate() index in the original loop was never used.
        for data in self.format_datas["data"]:
            self.result.add_detail(SlowLinkAnalyzer.SLOW_LINK_ANALYSIS, self.format_datas["headers"], data)

    def make_render(self):
        """Render the analysis result into the cluster HTML template."""
        result_for_html = {
            "Description": self.bottelneck,
            "suggestion": self.suggestion,
            "details": [self.format_datas]
        }

        self.html_render.render_template(key="cluster",
                                         title=SlowLinkAnalyzer.SLOW_LINK_ANALYSIS,
                                         template_dir="templates",
                                         template_name="cluster_analysis.html",
                                         cann_version=self.cann_version,
                                         torch_version=self.torch_version,
                                         result=result_for_html)
\ No newline at end of file
diff --git a/profiler/advisor_review/analyzer/cluster/slow_rank_analyser.py b/profiler/advisor_review/analyzer/cluster/slow_rank_analyser.py
new file mode 100644
index 0000000000000000000000000000000000000000..4215b514a215a2a350571746ff9cb90c3c9956eb
--- /dev/null
+++ b/profiler/advisor_review/analyzer/cluster/slow_rank_analyser.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import Dict, List
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.common import constant
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.advisor.dataset.cluster.cluster_dataset import ClusterStepTraceTimeDataSet
+
+
+class SlowRankAnalyzer(BaseAnalyzer):
+    """Detects slow ranks in a cluster by comparing per-rank step-trace times.
+
+    For each category in BOTTLENECK_LIST (Computing / Communication / Free),
+    flags a bottleneck when the max-min gap across ranks exceeds
+    RATIO_THRESHOLD of the mean total step time.
+    """
+    SLOW_RANK_ANALYSIS = "slow_rank_analysis"
+    RANK = "rank"
+    # Gap-to-mean ratio above which a category is reported as a bottleneck.
+    RATIO_THRESHOLD = 0.05
+    # Index order must match the tuple layout of ClusterStepTraceTimeDataSet rows.
+    BOTTLENECK_LIST = ['Computing', 'Communication', "Free"]
+    dataset_cls_list = [ClusterStepTraceTimeDataSet]
+
+    def __init__(self, collection_path, n_processes: int = 1, **kwargs):
+        super().__init__(collection_path, n_processes, **kwargs)
+        key = ClusterStepTraceTimeDataSet.get_key()
+        self.step_trace_class = self.get_first_data_by_key(self.dataset_list, key)
+        # Mapping rank_id -> [compute, communication, free] durations (or None on load failure).
+        self.step_trace_dict = self.step_trace_class.get_data()
+        self.result = OptimizeResult()
+        self.bottelneck = ''
+        self.suggestion = ''
+        # NOTE(review): initialized as a list but reassigned to a dict in
+        # optimize(); an empty dict would be a more honest initial value.
+        self.format_datas = []
+
+    def optimize(self, **kwargs):
+        """Run the analysis pipeline and return the populated OptimizeResult."""
+        if self.step_trace_dict is None:
+            # NOTE(review): "cluster_analysis_outpu" in this user-facing message
+            # looks like a typo for "cluster_analysis_output" — confirm before fixing.
+            print("slow_rank 分析失败,原因是数据加载失败,请检查你的cluster_analysis_outpu文件夹 \
+                  如不关心这类数据请忽略")
+            return self.result
+        self.process()
+        self.format_datas = self.format_details()
+        self.make_record()
+        self.make_render()
+        return self.result
+
+    def process(self):
+        """Compute the mean total step time and check each category for gaps."""
+        total_time_list = [sum(data_tuple) for rank_id, data_tuple in self.step_trace_dict.items()]
+        if total_time_list:
+            mean_total_time = sum(total_time_list) / len(total_time_list)
+            for i in range(len(self.BOTTLENECK_LIST)):
+                self.produce_bottleneck(self.step_trace_dict, i, mean_total_time)
+
+    def produce_bottleneck(self, step_dict: dict, produce_type: int, mean_total_time: float):
+        """Append a bottleneck message for one category when the rank gap is too large.
+
+        :param step_dict: rank_id -> per-category duration tuple
+        :param produce_type: index into BOTTLENECK_LIST / the duration tuple
+        :param mean_total_time: mean of the per-rank total step times
+        """
+        data_list = [data_tuple[produce_type] for rank_id, data_tuple in step_dict.items()]
+        max_ratio = self.compute_max_gap_ratio(data_list, mean_total_time)
+        if max_ratio > self.RATIO_THRESHOLD:
+            # Durations appear to be in microseconds (divided by 1000 to report ms) — confirm.
+            self.bottelneck += f'{self.BOTTLENECK_LIST[produce_type]} \n' \
+                               f'    has some issues in the cluster, \n' \
+                               f'    because the max difference of {self.BOTTLENECK_LIST[produce_type]} time \n' \
+                               f'    has reached {round(max_ratio * mean_total_time / 1000, 3)}ms. \n'
+
+    def make_record(self):
+        """
+        make record for what and how to optimize
+        """
+        optimization_item = OptimizeItem(
+            SlowRankAnalyzer.SLOW_RANK_ANALYSIS,
+            self.bottelneck,
+            self.suggestion
+        )
+        self.result.add(OptimizeRecord(optimization_item))
+        # NOTE(review): loop index i is unused.
+        for i, data in enumerate(self.format_datas["data"]):
+            self.result.add_detail(SlowRankAnalyzer.SLOW_RANK_ANALYSIS, self.format_datas["headers"], data)
+
+    def format_details(self):
+        """Return {"headers": [...], "data": [...]} rows of per-rank durations."""
+        details_dict = {}
+        headers = ["rank_id", "compute", "communication", "free"]
+        data_list = []
+        for key,value in self.step_trace_dict.items():
+            data_list.append([key] + value)
+        details_dict["headers"] = headers
+        details_dict["data"] = data_list
+        return details_dict
+
+    def make_render(self):
+        """Render the slow-rank analysis result into the cluster HTML template."""
+        result_for_html = {
+            "Description" : self.bottelneck,
+            "suggestion" : self.suggestion,
+            "details" : [self.format_datas]
+        }
+
+        self.html_render.render_template(key="cluster",
+                                         title=SlowRankAnalyzer.SLOW_RANK_ANALYSIS,
+                                         template_dir="templates",
+                                         template_name="cluster_analysis.html",
+                                         cann_version=self.cann_version,
+                                         torch_version=self.torch_version,
+                                         result=result_for_html)
+
+    @staticmethod
+    def compute_max_gap_ratio(data: list, mean: float):
+        """Return (max - min) / mean, or 0 when mean is 0 (avoids ZeroDivisionError)."""
+        if mean == 0:
+            return 0
+        else:
+            return (max(data) - min(data)) / mean
diff --git a/profiler/advisor_review/analyzer/communication/__init__.py b/profiler/advisor_review/analyzer/communication/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/communication/bandwidth/__init__.py b/profiler/advisor_review/analyzer/communication/bandwidth/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/communication/environment/__init__.py b/profiler/advisor_review/analyzer/communication/environment/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/computation/__init__.py b/profiler/advisor_review/analyzer/computation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/computation/aicpu/__init__.py b/profiler/advisor_review/analyzer/computation/aicpu/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/computation/aicpu/aicpu_checker.py b/profiler/advisor_review/analyzer/computation/aicpu/aicpu_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..4eca1c6c0278349cf4068544d2a53d8de7f0d5e1
--- /dev/null
+++ b/profiler/advisor_review/analyzer/computation/aicpu/aicpu_checker.py
@@ -0,0 +1,278 @@
+import copy
+import os
+from functools import partial
+from typing import List, Dict, Optional
+
+import yaml
+from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker, logger
+from profiler.advisor.analyzer.schedule.fusion_ops.timeline_api_stack_checker import OpStackFinder
+from profiler.advisor.common import constant
+from profiler.advisor.dataset.dataset import Dataset
+from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
+from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
+
+
+class AicpuChecker(OperatorChecker):
+    """Checker that flags AI_CPU operators, attaches Python stack info and
+    rule-based suggestions loaded from rules/aicpu_rules.yaml.
+    """
+    _CHECKER = "aicpu operator"
+    _PROBLEM = "AICPU operator"
+    # Minimum task duration (us) before AICPU operators are reported.
+    _MIN_TASK_DURATION = 20
+    _description = f"Some operators and task duration exceed {_MIN_TASK_DURATION} us, such as :\n"
+    _SUGGESTION: List[str] = ["Modify code to avoid aicpu operator"]
+    STACK_INFO_ITEMS = "stack_info"
+    SUGGESTION_INFO_ITEMS = "suggestions"
+    _ITEMS = [
+        "op_name", "op_type", "task_duration", "input_shapes", "input_data_types", "input_formats", "output_shapes",
+        "output_data_types", "output_formats"
+    ]
+
+    def __init__(self, cann_version):
+        super(AicpuChecker, self).__init__(cann_version=cann_version)
+        self.aicpu_rules: Dict = {}
+        self.aicpu_checker: Dict = {}
+        self.load_aicpu_rules()
+
+    def _check_data(self, profiling_data: ProfilingDataset) -> bool:
+        # Only requires the op summary to be present.
+        if not self._check_summary(profiling_data):
+            return False
+        return True
+
+    def _check_operator(self, op_info) -> bool:
+        # An operator is relevant iff it ran on AI_CPU.
+        return op_info.task_type == constant.AI_CPU
+
+    def load_aicpu_rules(self, rule_path="rules/aicpu_rules.yaml") -> Dict:
+        """Load and filter the aicpu rule file, instantiating one sub-checker
+        per supported rule section. Missing rule file only logs a warning.
+        """
+        if not os.path.isabs(rule_path):
+            # Relative paths are resolved against the package root (three levels up).
+            rule_path = os.path.join(os.path.dirname(__file__),
+                                     "../../../", rule_path)
+
+        if not os.path.exists(rule_path):
+            logger.warning("Skip analyze aicpu issues, because %s does not exist.", rule_path)
+            return {}
+        with open(rule_path, 'r') as f:
+            self.aicpu_rules = yaml.safe_load(f)
+        self.filter_aicpu_rules(self.aicpu_rules)
+        for checker_name, check_rule in self.aicpu_rules.items():
+            if not isinstance(check_rule, (list, dict,)):
+                continue
+
+            if checker_name not in AICPU_CHECKER.keys():
+                logger.warning("Skip %s, which is not support now.", checker_name)
+                continue
+
+            self.aicpu_checker[checker_name] = AICPU_CHECKER[checker_name](check_rule)
+
+    def filter_aicpu_rules(self, aicpu_rules):
+        """Keep only DataTypeChecker rules matching the current CANN version."""
+        support_checkers = []
+        for checkers in aicpu_rules['CommonChecker']:
+            for key, value in checkers.items():
+                if key == 'DataTypeChecker' and self.cann_version in value['cann_version']:
+                    support_checkers.append(checkers)
+        aicpu_rules['CommonChecker'] = support_checkers
+        return
+
+    def check_aicpu_attr(self, op_info) -> List[str]:
+        """Collect suggestions from every loaded sub-checker for one operator."""
+        suggestions = []
+        for _, checker in self.aicpu_checker.items():
+            suggestions.extend(checker.check(op_info))
+        return suggestions
+
+    def check(self, profiling_data: ProfilingDataset) -> bool:
+        """
+        check if any operator need optimize
+        :param profiling_data: profiling datasest
+        :return: true or false
+        """
+
+        if not self._check_data(profiling_data):
+            return False
+        op_summary = profiling_data.op_summary
+
+        # NOTE(review): "opeartor" is a typo for "operator" (local helper only).
+        def get_opeartor_stack_info(api_stack_finder: OpStackFinder, op_name_list: list) -> list:
+            data: Dict[str, Dataset] = {}
+            event_dataset = TimelineEventDataset(collection_path=profiling_data.collection_path, data=data, task_type=constant.AI_CPU)
+
+            # disable multiprocessing, avoid cost time of enable new process for light task
+            api_stack_finder.get_api_stack_by_op(event_dataset, op_name_list, constant.AI_CPU,
+                                                 disable_multiprocess=True)
+            return api_stack_finder._stack_record
+
+        self._op_list = []
+        total_task_duration = 0.0
+        max_task_duration = 0.0
+        # NOTE(review): duration totals accumulate over ALL operators, not only
+        # AI_CPU ones (the duration lines are outside the if) — confirm intent.
+        for op_info in op_summary.op_list:
+            if self._check_operator(op_info):
+                self._op_list.append(op_info)
+
+            task_duration = float(op_info.task_duration)
+            total_task_duration += task_duration
+            max_task_duration = max(max_task_duration, task_duration)
+        if (not self._op_list) or (max_task_duration < self._MIN_TASK_DURATION):
+            return False
+
+        # Collect the unique op names so stack lookup is done once per name.
+        op_name_list = []
+        for op in self._op_list:
+            if op.op_name not in op_name_list:
+                op_name_list.append(op.op_name)
+        api_stack_finder = OpStackFinder()
+        stack_record = get_opeartor_stack_info(api_stack_finder, op_name_list)
+
+        # Map task_id -> stack string (stack_record rows are (task_id, ..., stack)).
+        self._op_list.sort(key=lambda x: int(x.task_id))
+        stack_record.sort(key=lambda x: x[0])
+        task_id_to_stack = dict()
+        for stack in stack_record:
+            task_id_to_stack[stack[0]] = stack[-1]
+
+        # Attach the stack and rule-based suggestions to each AI_CPU operator.
+        for op in self._op_list:
+            stack = task_id_to_stack.get(int(op.task_id))
+            op.add_attr(self.STACK_INFO_ITEMS, stack)
+            suggestions = self.check_aicpu_attr(op)
+            op.add_attr(self.SUGGESTION_INFO_ITEMS, suggestions)
+
+        # Detect operators fed with DOUBLE inputs (a common AI_CPU fallback cause).
+        double_type_ai_cpu_operator = []
+        for op in self._op_list:
+            if not op.has_attr("input_data_types"):
+                logger.warning(
+                    "Skip checking of input data in AICPU checker because of not containing input_data_dtypes in op summary")
+                break
+            if op.has_attr(
+                    "input_data_types") and "DOUBLE" in op.input_data_types and op.op_name not in double_type_ai_cpu_operator:
+                double_type_ai_cpu_operator.append(op.op_name)
+        if bool(double_type_ai_cpu_operator):
+            # NOTE(review): appends to the class-level _SUGGESTION list, so the
+            # suggestion accumulates across instances — confirm intent.
+            self._SUGGESTION.append("Try to convert double type operator to float, such as {}".format(
+                ",".join(double_type_ai_cpu_operator)))
+        return True
+
+    def make_render(self, html_render, record):
+        """Render this checker's findings with the AI_CPU HTML template."""
+        html_render.render_template(key="computation",
+                                    template_dir="templates",
+                                    template_name="operator_ai_cpu.html",
+                                    format_result=self.format_operator_result(record, constant.OPERATOR_LIST_UNLIMIT))
+
+    def format_operator_result(self, record, limit):
+        """
+        Format operator result to html
+        :param record: profiling check record
+        :param limit: Limit number of operator statistics lists.
+        :return:
+        """
+        optimization_item = record.optimization_item
+        release_suggestion_list = []
+        for suggestion in optimization_item.suggestion:
+            # NOTE(review): the replacement literal below appears truncated in this
+            # patch (likely an HTML line-break tag was stripped) — verify upstream.
+            release_suggestion_list.append(suggestion.replace('\n', '
'))
+        logger.debug("suggestion list is %s", release_suggestion_list)
+        format_result = {"record": record.__dict__, "suggestion": '
'.join(release_suggestion_list),
+                         "task_duration": round(record.statistics_item.task_duration, 2)}
+
+        statistic = self.group_by(copy.deepcopy(self._op_list), op_key='op_type',
+                                  limit=limit)
+        format_result["statistic"] = statistic
+        stack_key_list = ["stack_info", "input_data_types", "output_data_types"]
+        if statistic:
+            for key, info in statistic:
+                op_info_list = self.group_by_list(info.get("op_info_list"), stack_key_list, limit)
+                info["op_info_list"] = op_info_list
+        return format_result
+
+    # NOTE(review): mutable default argument (list) — shared across calls; safe
+    # only while callers never mutate op_key_list. Prefer None + default inside.
+    def group_by_list(self, op_list, op_key_list: List = ["stack_info", "input_data_types", "output_data_types"],
+                      limit: int = constant.OPERATOR_LIST_UNLIMIT):
+        if op_list is None:
+            op_list = []
+
+        # Concatenate the listed attributes into one synthetic attribute used as
+        # the group-by key.
+        op_key = '+'.join(op_key_list) # str, json
+        for op_info in op_list:
+            attribute = ""
+            for _op in op_key_list:
+                if op_info.get_attr(_op):
+                    attribute += op_info.get_attr(_op)
+            op_info.add_attr(op_key, attribute)
+
+        return self.group_by(op_list, op_key=op_key, limit=limit)
+
+
+# NOTE(review): class name "BaserChecker" looks like a typo for "BaseChecker";
+# renaming would break subclasses elsewhere, so it is only flagged here.
+class BaserChecker:
+    """Base class for rule-driven checkers: subclasses fill self.checker_list
+    with callables taking an op_info and returning a suggestion string or None.
+    """
+    def __init__(self, *args, **kwargs):
+        self.checker_list = []
+
+    def build(self):
+        # Subclasses populate self.checker_list here.
+        raise NotImplementedError
+
+    def check(self, op_info) -> List[str]:
+        """Run every registered checker; collect non-None suggestions."""
+        suggestions = []
+        for checker in self.checker_list:
+            suggestion = checker(op_info)
+            if suggestion is not None:
+                suggestions.append(suggestion)
+        return suggestions
+
+
+class CommonChecker(BaserChecker):
+    """Checker built from 'CommonChecker' rule entries; currently only the
+    DataTypeChecker rule kind is implemented.
+    """
+    def __init__(self, check_rules: List[Dict] = None):
+        super(CommonChecker, self).__init__()
+        self.check_rules = check_rules if check_rules is not None else []
+        self.supported_checker = dict(DataTypeChecker=self.datatype_checker)
+        self.build()
+
+    @staticmethod
+    def datatype_checker(check_item: Dict, op_info) -> Optional[str]:
+        """Return a suggestion when the operator uses dtypes outside the rule's
+        valid input/output sets; None when the op is not covered or is valid.
+        """
+        supported_op_type = check_item.get('op_type', [])
+        suggestion = check_item.get('suggestion', "")
+        valid_inputs = check_item.get('input', [])
+        valid_outputs = check_item.get('output', [])
+        ignore_type = check_item.get('ignore_type', [])
+        op_type = getattr(op_info, 'op_type', "UNKNOWN")
+        # "__ALL__" in the rule means the rule applies to every op type.
+        if "__ALL__" in supported_op_type or \
+                op_type.lower() in supported_op_type:
+            if op_type.lower() in ignore_type:
+                return None
+
+            # Dtypes are ';'-separated in the op summary; compare lower-cased.
+            op_input_dtype = getattr(op_info, 'input_data_types', "").split(";")
+            op_input_dtype = [item.lower() for item in op_input_dtype]
+            op_output_dtype = getattr(op_info, 'output_data_types', "").split(";")
+            op_output_dtype = [item.lower() for item in op_output_dtype]
+            input_dtype_diff = set(op_input_dtype).difference(set(valid_inputs))
+            output_dtype_diff = set(op_output_dtype).difference(set(valid_outputs))
+            unsupported_dtype_diff = input_dtype_diff.union(output_dtype_diff)
+            if not unsupported_dtype_diff:
+                return None
+
+            # Rule suggestion template: {0}=offending dtypes, {1}=op type, {2}=valid inputs.
+            return suggestion.format(",".join(unsupported_dtype_diff).upper(),
+                                     op_type,
+                                     ",".join(valid_inputs).upper())
+
+    def build(self):
+        """Bind each supported rule to its checker function via partial()."""
+        for check in self.check_rules:
+            # Each rule entry is a single-item dict: {checker_name: rule_body}.
+            (check_func, check_rule), = check.items()
+            if check_func not in self.supported_checker:
+                logger.warning("Skip %s, which has not been implemented.", check_func)
+                continue
+            self.checker_list.append(partial(self.supported_checker.get(check_func), check_rule))
+
+
+class ExampleGuideChecker(BaserChecker):
+    """Checker that maps known op types to a documentation URL suggestion."""
+    def __init__(self, check_rules: List[Dict] = None):
+        super(ExampleGuideChecker, self).__init__()
+        self.check_rules = check_rules if check_rules is not None else []
+        self.build()
+
+    def build(self):
+        def _guide_url(check_item: Dict, op_info) -> Optional[str]:
+            # Returns the rule's suggestion (with the URL substituted when the
+            # template contains "{}") or implicitly None for uncovered op types.
+            supported_op_type = check_item.get('op_type', [])
+            url = check_item.get('url', "")
+            suggestion = check_item.get('suggestion', "")
+
+            if getattr(op_info, 'op_type', "UNKNOWN").lower() in supported_op_type:
+                return suggestion if "{}" not in suggestion else suggestion.format(url)
+
+        for check in self.check_rules:
+            # Each rule entry is a single-item dict; the name is unused here.
+            (_, check_rule), = check.items()
+            self.checker_list.append(partial(_guide_url, check_rule))
+
+
+# Registry mapping rule-file section names to their checker classes;
+# consulted by AicpuChecker.load_aicpu_rules().
+AICPU_CHECKER = {
+    "CommonChecker": CommonChecker,
+    "ExampleGuideChecker": ExampleGuideChecker
+}
diff --git a/profiler/advisor_review/analyzer/computation/bound/__init__.py b/profiler/advisor_review/analyzer/computation/bound/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor_review/analyzer/computation/bound/block_dim_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7d7ddd93c70e59dc0d10318fdac06fdc581f70c
--- /dev/null
+++ b/profiler/advisor_review/analyzer/computation/bound/block_dim_checker.py
@@ -0,0 +1,75 @@
+import logging
+
+from typing import List
+
+from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker
+from profiler.advisor.common import constant
+from profiler.advisor.config.config import Config
+from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
+
+logger = logging.getLogger()
+
+
+class BlockDimChecker(OperatorChecker):
+    """Checker that flags operators whose block_dim does not divide evenly by
+    the available AI core (or AI vector core) count, i.e. cores are underused.
+    """
+    _SUGGESTION: List[str] = []
+    _CHECKER = "block dim"
+    _PROBLEM = "block dim"
+    # "{}" is filled with the configured ai core count in _check_data.
+    _description = "some operator does not make full use of {} ai core"
+    _ITEMS = [
+        "op_name", "op_type", "task_type", "task_duration", "income", "block_dim", "mix_block_dim", "input_shapes",
+        "input_data_types", "input_formats", "output_shapes", "output_data_types", "output_formats"
+    ]
+
+    def pre_check(self, profiling_data) -> bool:
+        # Block-dim analysis only makes sense for static-shape workloads.
+        return not self.is_dynamic_shape(profiling_data)
+
+    def _check_data(self, data):
+        """Validate required inputs (summary, ai_core_num config, block_dim column)
+        and finalize the description text. Returns False to skip the checker.
+        """
+        self.format_suggestion_content(data)
+        if not self._check_summary(data):
+            return False
+        if not Config().get_config("ai_core_num"):
+            logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "ai core num in info.json file")
+            return False
+        summary = data.op_summary
+        op_info = summary.op_list[0]
+        if not hasattr(op_info, "block_dim"):
+            logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "block dim in op summary")
+            return False
+        if Config().get_config("ai_core_num"):
+            self._aicore_num = int(Config().get_config("ai_core_num"))
+        if Config().get_config("aiv_num"):
+            self._aiv_num = int(Config().get_config("aiv_num"))
+        self._description = self._description.format(self._aicore_num)
+        # NOTE(review): if the "aiv_num" config is absent, self._aiv_num is never
+        # assigned and this read would raise AttributeError — confirm a class-level
+        # default exists in the (not shown) base class.
+        if self._aiv_num:
+            self._description += f" or {self._aiv_num} ai vector core"
+        self._description += f";\n Top-{OperatorChecker._MAX_TUNE_OP_NUM} operator of " \
+                             "task duration are as follows:\n"
+        return True
+
+    def make_render(self, html_render, record):
+        """Render this checker's findings with the block-dim HTML template."""
+        html_render.render_template(key="computation",
+                                    template_dir="templates",
+                                    template_name="operator_block_dim.html",
+                                    format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK))
+
+    def _check_operator(self, op_info) -> bool:
+        """True when the op's block_dim leaves cores idle (not a clean multiple)."""
+        if op_info.task_type not in ["AI_CORE", "AI_VECTOR_CORE", "MIX_AIC"]:
+            return False
+        block_dim = int(op_info.block_dim)
+        core_num = self.get_core_num(op_info)
+        if block_dim % core_num == 0:
+            return False
+        # MIX_AIC ops also pass if their vector-core block dim fits evenly.
+        if op_info.task_type == "MIX_AIC" and hasattr(op_info, "mix_block_dim") \
+                and self._aiv_num and int(op_info.mix_block_dim) % self._aiv_num == 0:
+            return False
+        return True
+
+    def get_core_num(self, op_info):
+        """
+        get core num of task type
+        """
+        # AI_CORE tasks compare against aicore count; others use aiv count when set.
+        if op_info.task_type == "AI_CORE" or not self._aiv_num:
+            core_num = self._aicore_num
+        else:
+            core_num = self._aiv_num
+        return core_num
diff --git a/profiler/advisor_review/analyzer/computation/bound/operator_bound_checker.py b/profiler/advisor_review/analyzer/computation/bound/operator_bound_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..a22b380f974b14207d6d7be262cd49f0ba0fbe99
--- /dev/null
+++ b/profiler/advisor_review/analyzer/computation/bound/operator_bound_checker.py
@@ -0,0 +1,53 @@
+import logging
+from typing import List
+
+from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker
+from profiler.advisor.common import constant
+from profiler.advisor.config.config import Config
+from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
+from profiler.advisor.utils.utils import to_percent
+
+logger = logging.getLogger()
+
+
+class OperatorBoundChecker(OperatorChecker):
+    """Checker that flags operators where no pipeline (vec/mac/scalar/mte*)
+    ratio exceeds the configured bound ratio, i.e. no unit is the bottleneck.
+    """
+    _MIN_TASK_DURATION = 20  # min task duration 20us
+    _CHECKER = "operator no bound"
+    _PROBLEM = "operator no bound"
+    _SUGGESTION: List[str] = []
+    _description = (
+        f"There is no mte, cube, vector, scalar ratio is more than {to_percent(Config().operator_bound_ratio)};\n" +
+        f"Top task duration operators need to be tuned are as follows: \n")
+    _ITEMS = [
+        "op_name", "op_type", "task_type", "task_duration", "vec_ratio", "mac_ratio", "scalar_ratio", "mte1_ratio",
+        "mte2_ratio", "mte3_ratio", "block_dim", "input_shapes", "input_data_types", "input_formats", "output_shapes",
+        "output_data_types", "output_formats"
+    ]
+
+    def pre_check(self, profiling_data) -> bool:
+        # Ratio analysis only makes sense for static-shape workloads.
+        return not self.is_dynamic_shape(profiling_data)
+
+    def _check_data(self, data):
+        self.format_suggestion_content(data)
+        if not self._check_summary(data):
+            return False
+        # NOTE(review): returns on the FIRST op only — this uses op[0] as a probe
+        # for whether ratio columns exist; the warning below is unreachable for a
+        # non-empty op list. Confirm this probe behavior is intended.
+        for op_info in data.op_summary.op_list:
+            return self._check_operator(op_info)
+
+        logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "ratio in op summary")
+        return False
+
+    def _check_operator(self, op_info) -> bool:
+        """True when ratios exist but none exceeds the bound threshold."""
+        bound_list = ["vec_ratio", "mac_ratio", "scalar_ratio", "mte1_ratio", "mte2_ratio", "mte3_ratio"]
+        ratio_list = [self.get_ratio(op_info, attr) for attr in bound_list]
+        if not any(ratio_list):
+            return False  # no data, skip check
+        # If any unit exceeds the bound ratio the op IS bound — not a finding here.
+        if any(ratio and ratio > Config().operator_bound_ratio for ratio in ratio_list):
+            return False
+        return True
+
+    def make_render(self, html_render, record):
+        """Render this checker's findings with the no-bound HTML template."""
+        html_render.render_template(key="computation",
+                                    template_dir="templates",
+                                    template_name="operator_no_bound.html",
+                                    format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK))
diff --git a/profiler/advisor_review/analyzer/computation/op_compile/__init__.py b/profiler/advisor_review/analyzer/computation/op_compile/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/computation/op_compile/dynamic_shape_checker.py b/profiler/advisor_review/analyzer/computation/op_compile/dynamic_shape_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..86d3bac4ff8cb163d23a6365307b855839b12a6a
--- /dev/null
+++ b/profiler/advisor_review/analyzer/computation/op_compile/dynamic_shape_checker.py
@@ -0,0 +1,65 @@
+import copy
+import logging
+from typing import List
+
+from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker
+from profiler.advisor.common import constant
+from profiler.advisor.dataset.profiling.info_collection import OpInfo
+from profiler.advisor.result.item import OptimizeItem, StatisticsItem, OptimizeRecord
+
+logger = logging.getLogger()
+
+
+class DynamicShapeChecker(OperatorChecker):
+    """Checker that reports when the whole workload ran with dynamic-shape
+    operators and suggests enabling compiled (jit_compile=False) mode.
+    """
+    ENABLE_COMPILED_SUGGESTION = "Optimize by enabling compiled operator, such as:\n" \
+                                 "`torch_npu.npu.set_compile_mode(jit_compile=False)`\n"
+    _SUGGESTION: List[str] = [ENABLE_COMPILED_SUGGESTION]
+    _CHECKER = "dynamic shape operator"
+    _PROBLEM = "Dynamic shape operator"
+    _description = f"Found all operators are dynamic shape"
+    _op_list: List[OpInfo] = []
+    _tune_op_list: List[str] = []  # record op name to be tuned, and save to tune_ops_file.cfg
+    _op_views: List = []
+
+    def __init__(self, cann_version) -> None:
+        super().__init__(cann_version=cann_version)
+
+    def check(self, profiling_database) -> bool:
+        # Delegates entirely to the base-class dynamic-shape detection.
+        return self.is_dynamic_shape(profiling_database)
+
+    def make_record(self, profiling_database) -> OptimizeRecord:
+        """
+        make record for what and how to optimize
+        """
+
+        optimization_item = OptimizeItem(
+            self._PROBLEM,
+            self._description,
+            self._SUGGESTION
+        )
+        # No per-op statistics exist for this finding; count is fixed at 1.
+        statistics_item = StatisticsItem("", "", 1)
+        return OptimizeRecord(optimization_item, statistics_item)
+
+    def format_operator_result(self, record, limit=-1):
+        """
+        Format operator result to html
+        :param record: profiling check record
+        :param limit: Limit number of operator statistics lists.
+        :return:
+        """
+        optimization_item = record.optimization_item
+        release_suggestion_list = []
+        for suggestion in optimization_item.suggestion:
+            release_suggestion = copy.deepcopy(suggestion)
+            if release_suggestion == DynamicShapeChecker.ENABLE_COMPILED_SUGGESTION:
+                # NOTE(review): "LINK" and the replacement literal below appear
+                # truncated in this patch (HTML anchor/br tags stripped) — verify upstream.
+                release_suggestion += \
+                    f"for details please refer to link : LINK"
+            release_suggestion_list.append(release_suggestion.replace('\n', '
'))
+        format_result = {"record": record.__dict__, "suggestion": '
'.join(release_suggestion_list)}
+        return format_result
+
+    def make_render(self, html_render, record):
+        """Render this checker's findings with the dynamic-shape HTML template."""
+        html_render.render_template(key="computation",
+                                    template_dir="templates",
+                                    template_name="operator_dynamic_shape.html",
+                                    format_result=self.format_operator_result(record))
diff --git a/profiler/advisor_review/analyzer/computation/operator_checker.py b/profiler/advisor_review/analyzer/computation/operator_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f47650943a7355b494bd766214d10526c46c0fa
--- /dev/null
+++ b/profiler/advisor_review/analyzer/computation/operator_checker.py
@@ -0,0 +1,307 @@
+import copy
+import logging
+from textwrap import fill
+from typing import List
+
+from profiler.advisor.common import constant
+from profiler.advisor.common.version_control import VersionControl
+from profiler.advisor.config.config import Config
+from profiler.advisor.dataset.profiling.info_collection import OpInfo
+from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
+from profiler.advisor.result.item import OptimizeItem, StatisticsItem, OptimizeRecord
+from profiler.advisor.utils.utils import safe_division
+
+logger = logging.getLogger()
+
+
+class OperatorChecker(VersionControl):
+ _SUPPORT_VERSIONS = constant.SUPPORTED_CANN_VERSION
+ _MAX_TUNE_OP_NUM = constant.OPERATOR_OUT_TOPK
+ _MIN_TASK_DURATION = 0
+ _MIN_TASK_DURATION_RATIO = 1.0
+ _MIN_TOTAL_DURATION_RATIO = 1.0
+ _CHECKER = str()
+ _PROBLEM = str()
+ _description = str()
+ STACK_INFO_ITEMS = ""
+ _ITEMS: List[str] = []
+ _SUGGESTION: List[str] = []
+ SKIP_CHECK_MSG = "Skip %s checker because of not containing %s"
+ _tune_op_info_list: List[OpInfo] = []
+ PyTorch_OPERATOR_TUNE_SUGGESTION = f"Optimize operator by AOE, such as:\n" \
+ f"'aoe --job_type=2 --model_path=$user_dump_path " \
+ f"--tune_ops_file={Config().tune_ops_file}'\n"
+ MSLite_OPERATOR_TUNE_SUGGESTION = f"Optimize operator by AOE in mindspore lite framework, such as:\n" \
+ f"converter_lite --fmk=ONNX --optimize=ascend_oriented --saveType=MINDIR " \
+ f"--modelFile=$user_model.onnx --outputFile=user_model --configFile=./config.txt\n"
+ _tune_op_list: List[str] = []
+
+    def __init__(self, cann_version: str):
+        # CANN toolkit version string; drives version-dependent behavior
+        # (e.g. where op_state is read from in is_dynamic_shape).
+        self.cann_version = cann_version
+        # Per-instance list of operators that matched this checker.
+        self._op_list: List[OpInfo] = []
+
+    def check(self, profiling_data: ProfilingDataset) -> bool:
+        """
+        check if any operator need optimize
+        :param profiling_data: profiling datasest
+        :return: true or false
+        """
+        if not self._check_data(profiling_data):
+            return False
+
+        summary = profiling_data.op_summary
+        total_task_duration = 0.0
+        max_task_duration = 0.0
+        # Accumulate matching operators and their duration stats.
+        for op_info in summary.op_list:
+            if not self._check_operator(op_info):
+                continue
+            task_duration = float(op_info.task_duration)
+            total_task_duration += task_duration
+            max_task_duration = max(max_task_duration, task_duration)
+            self._op_list.append(op_info)
+            if task_duration > self._MIN_TASK_DURATION:
+                self._tune_op_info_list.append(op_info)
+
+        # Report when any threshold trips: absolute max duration, max-duration
+        # ratio, or total-duration ratio (subclasses tune the class constants).
+        if any([
+            max_task_duration > self._MIN_TASK_DURATION,
+            round(safe_division(max_task_duration, summary.get_total_task_duration()),
+                  4) > self._MIN_TASK_DURATION_RATIO,
+            round(safe_division(total_task_duration, summary.get_total_task_duration()), 4) >
+            self._MIN_TOTAL_DURATION_RATIO,
+        ]):
+            # Sort heaviest-first and keep at most OPERATOR_OUT_TOPK unique names to tune.
+            self._op_list.sort(key=lambda x: float(x.get_attr("task_duration")), reverse=True)
+            self._tune_op_info_list.sort(key=lambda x: float(x.get_attr("task_duration")), reverse=True)
+            for op in self._op_list:
+                if op.op_name not in self._tune_op_list and len(self._tune_op_list) < constant.OPERATOR_OUT_TOPK:
+                    self._tune_op_list.append(op.op_name)
+            return True
+        return False
+
+    def make_record(self, profiling_data: ProfilingDataset):
+        """
+        Make record for what and how to optimize
+        :param profiling_data: profiling data
+        :return: optimize record
+        """
+        # The hasattr guard skips any non-OpInfo entries defensively.
+        task_duration_list = [float(op_info.get_attr("task_duration")) for op_info in self._op_list if
+                              hasattr(op_info, "get_attr")]
+        total_cost_time = sum(task_duration_list)
+        total_task_duration = profiling_data.op_summary.get_total_task_duration()
+        count = len(task_duration_list)
+        statistics_item = StatisticsItem(total_task_duration, total_cost_time, count, self.get_incomes())
+        optimization_item = OptimizeItem(
+            self._PROBLEM,
+            self._get_description(self._description, self.get_op_type_list(self._op_list)[:self._MAX_TUNE_OP_NUM]),
+            self._SUGGESTION
+        )
+        return OptimizeRecord(optimization_item, statistics_item)
+
+    def _get_description(self, description, op_type_list=None):
+        """Append a comma-separated op-type list (3 names per line) to description.
+
+        Returns description unchanged when op_type_list is empty or None.
+        """
+        if not op_type_list:
+            return description
+
+        desc_suffix = []
+        for i in range(len(op_type_list)):
+            # Start a new line before every 4th name for readability.
+            if i % 3 == 0 and i != 0:
+                desc_suffix.append("\n")
+
+            desc_suffix.append(f"{op_type_list[i]}")
+
+            if i < len(op_type_list) - 1:
+                desc_suffix.append(", ")
+
+        description += "".join(desc_suffix)
+        return description
+
+    def pre_check(self, profiling_data) -> bool:
+        """Hook for subclasses to veto the check; base implementation always passes."""
+        return True
+
+    def is_dynamic_shape(self, profiling_database: ProfilingDataset) -> bool:
+        """Return True when the profile contains no static-shape operators.
+
+        The op_state source differs by CANN version: ge_info before 8.0.0,
+        op_summary from 8.0.0 onward. Returns False (with a warning) when the
+        required data source is missing.
+        """
+        less_than_cann800_list = [constant.CANN_VERSION_C30, constant.CANN_VERSION_C13, constant.CANN_VERSION_C15]
+        # Before CANN 8.0.0, op_state comes from ge_info for the dynamic-shape check.
+        if self.cann_version in less_than_cann800_list:
+            if hasattr(profiling_database, "ge_info"):
+                ge_info = profiling_database.ge_info
+                static_shape_operators = ge_info.get_static_shape_operators()
+                if len(static_shape_operators) == 0:
+                    return True
+            else:
+                # NOTE(review): "filefloder" in this log message is a typo for "file folder".
+                logger.warning(
+                    "Skip dynamic shape check because of not containing ge_info.db file in host filefloder.\n"
+                    "To enable dynamic shape check, please try to set data_simplification=False in experimental_config.\n"
+                    "More details please refer to link : %s", constant.ASCEND_PROFILER_URL)
+        else:
+            # From CANN 8.0.0 onward, op_state is read from the op_summary file.
+            if hasattr(profiling_database, "op_summary"):
+                static_shape_operators = profiling_database.op_summary.get_static_shape_operators()
+                if len(static_shape_operators) == 0:
+                    return True
+            else:
+                logger.warning(
+                    "Skip dynamic shape check because of not containing op_summary.csv file in current filefloder."
+                )
+        return False
+
+    def format_operator_result(self, record, limit):
+        """
+        Format operator result to html
+        :param record: profiling check record
+        :param limit: Limit number of operator statistics lists.
+        :return:
+        """
+        optimization_item = record.optimization_item
+        release_suggestion_list = []
+        for suggestion in optimization_item.suggestion:
+            release_suggestion = copy.deepcopy(suggestion)
+            # NOTE(review): "LINK" placeholders and the replace/join literals below
+            # appear truncated in this patch (HTML anchor/br tags stripped) — verify upstream.
+            if release_suggestion == OperatorChecker.PyTorch_OPERATOR_TUNE_SUGGESTION:
+                release_suggestion += \
+                    (f"for details please refer to link : LINK")
+            elif release_suggestion == OperatorChecker.MSLite_OPERATOR_TUNE_SUGGESTION:
+                release_suggestion += \
+                    (f"\nThe config file for MSLite AOE usage is as follows:\n" \
+                     f"[ascend_context]\n" \
+                     f"aoe_mode=\"operator tuning\"\n" \
+                     f"--tune_ops_file={Config().tune_ops_file}\n"
+                     f"\nFor details please refer to link : LINK")
+            release_suggestion_list.append(release_suggestion.replace('\n', '
'))
+        format_result = {"record": record.__dict__,
+                         "suggestion": fill('
'.join(release_suggestion_list), width=200),
+                         "task_duration": round(record.statistics_item.task_duration, 2)}
+        statistic = self.group_by(copy.deepcopy(self._op_list), limit=limit)
+        format_result["statistic"] = statistic
+        return format_result
+
+ def group_by(self, op_list, op_key="op_type",
+ limit: int = constant.OPERATOR_LIST_UNLIMIT):
+ """
+ group by Profiling.OpInfo's attribute key, then return top limit tuple by duration
+ :param op_list: input a OpInfo list
+ :param op_key: group by Profiling.OpInfo's attribute key
+ :param limit: top limit num, if you do not need to limit the length of tuple, input -1(int)
+ :return:
+ """
+ if op_list is None:
+ op_list = []
+ statistic = {} # str, json
+ for op_info in op_list:
+ if statistic.get(op_info.get_attr(op_key)):
+ statistic[op_info.get_attr(op_key)]["summary"]["total_duration"] = float(
+ statistic[op_info.get_attr(op_key)]["summary"]["total_duration"]) + float(
+ op_info.get_attr("task_duration", constant.DEFAULT_DURATION_ZERO))
+ statistic[op_info.get_attr(op_key)]["summary"]["counts"] += 1
+ stack_info = op_info.get_attr("stack_info")
+ if stack_info:
+ op_info.stack_info = stack_info.replace('\r\n', '
')
+ statistic[op_info.get_attr(op_key)]["op_info_list"].append(op_info)
+ else:
+ statistic[op_info.get_attr(op_key)] = {"summary": {}, "op_info_list": []}
+ statistic[op_info.get_attr(op_key)]["summary"]["op_type"] = op_info.get_attr(
+ "op_type", constant.DEFAULT_OPERATOR_TYPE)
+ statistic[op_info.get_attr(op_key)]["summary"]["total_duration"] = float(
+ op_info.get_attr("task_duration", constant.DEFAULT_DURATION_ZERO))
+ statistic[op_info.get_attr(op_key)]["summary"]["counts"] = 1
+ stack_info = op_info.get_attr("stack_info")
+ if stack_info:
+ op_info.stack_info = stack_info.replace('\r\n', '
')
+ statistic[op_info.get_attr(op_key)]["op_info_list"] = [op_info]
+
+ if statistic:
+ for op_key in statistic.keys():
+ statistic[op_key]["summary"]["total_duration"] = round(
+ statistic[op_key]["summary"]["total_duration"], 2)
+ # Grouped by op_type, sorted by total_duration, and obtained the top 10 operators that take the most time.
+ if limit > 0:
+ statistic = sorted(
+ statistic.items(), key=lambda kv: kv[1]["summary"]["total_duration"], reverse=True)[:limit]
+ else:
+ statistic = sorted(statistic.items(), key=lambda kv: kv[1]["summary"]["total_duration"], reverse=True)
+ else:
+ logger.warning("%s checker do not has results to format html", str(self.__class__.__name__))
+ return statistic
+
+ def _check_data(self, profiling_data):
+ return True
+
+ def _check_operator(self, op_info):
+ return False
+
+ def _get_income(self, _op_info: OpInfo) -> float:
+ return 0
+
+ def get_tune_op_list(self):
+ """
+ get tune op list
+ :return: tune op list
+ """
+ return self._tune_op_list
+
+ def get_views(self, _graph_data):
+ """Get node views."""
+ return []
+
+ @classmethod
+ def get_name(cls):
+ """
+ get name of checker
+ :return: checker name
+ """
+ return cls._PROBLEM
+
+ def get_incomes(self) -> float:
+ """get incomes"""
+ incomes = 0.0
+ for op_info in self._op_list:
+ income = self._get_income(op_info)
+ setattr(op_info, "income", round(income, 2))
+ incomes += income
+ return incomes
+
+ def get_op_type_list(self, op_list: List[OpInfo]):
+ """get op type list"""
+ op_type_list = []
+ for op_info in op_list:
+ if op_info.op_type not in op_type_list:
+ op_type_list.append(op_info.op_type)
+ return op_type_list
+
+ def _check_summary(self, data: ProfilingDataset):
+ if not hasattr(data, "op_summary"):
+ logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "op summary")
+ return False
+ return True
+
+ @staticmethod
+ def get_ratio(op_info: OpInfo, attr: str) -> float:
+ if not op_info.has_attr(attr):
+ return 0
+ value = op_info.get_attr(attr)
+ if not value or value == "N/A":
+ return 0
+ return float(value)
+
+ def get_details(self) -> list:
+ """
+ get details of operator to be optimized
+ :return: detail list
+ """
+ op_list = self._op_list
+ if not op_list or not (self._ITEMS + [self.STACK_INFO_ITEMS]):
+ return []
+ details = []
+ attrs = [attr for attr in (self._ITEMS + [self.STACK_INFO_ITEMS]) if op_list[0].has_attr(attr)]
+ details.append(attrs)
+ op_list = sorted(op_list, key=lambda x: float(x.get_attr("task_duration")), reverse=True)
+ for op_info in op_list:
+ content = [
+ op_info.get_attr(attr) if attr != "aicore_time"
+ else op_info.get_float_attr(attr, strict_mode=True) +
+ op_info.get_float_attr("aiv_time", strict_mode=True) for attr in attrs
+ ]
+ details.append(content)
+ return details
+
+ def format_suggestion_content(self, profiling_data: ProfilingDataset) -> None:
+ if profiling_data.PROF_TYPE == constant.ASCEND_PYTORCH_PROFILER:
+ self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION)
+ elif profiling_data.PROF_TYPE == constant.MSLITE:
+ self._SUGGESTION.append(self.MSLite_OPERATOR_TUNE_SUGGESTION)
diff --git a/profiler/advisor_review/analyzer/computation/profiling_analyzer.py b/profiler/advisor_review/analyzer/computation/profiling_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8682617700702055628a31982b0eafab9feb336d
--- /dev/null
+++ b/profiler/advisor_review/analyzer/computation/profiling_analyzer.py
@@ -0,0 +1,89 @@
+import logging
+from abc import ABC
+from typing import Dict, List
+
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.common import constant
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.analyzer.computation.aicpu.aicpu_checker import AicpuChecker
+from profiler.advisor.analyzer.computation.bound.block_dim_checker import BlockDimChecker
+from profiler.advisor.analyzer.computation.bound.operator_bound_checker import OperatorBoundChecker
+from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker
+from profiler.advisor.analyzer.computation.op_compile.dynamic_shape_checker import DynamicShapeChecker
+from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker
+from profiler.advisor.display.html.render import HTMLRender
+from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
+from profiler.advisor.utils.utils import get_supported_subclass
+
+logger = logging.getLogger()
+
+
class ProfilingAnalyzer(BaseAnalyzer, ABC):
    """Base analyzer that drives a single OperatorChecker over a ProfilingDataset."""
    dataset_cls_list = [ProfilingDataset]

    def __init__(self, collection_path, **kwargs) -> None:
        super().__init__(collection_path, **kwargs)
        self.checker = OperatorChecker(self.cann_version)
        self.html_render = HTMLRender()
        self.result = OptimizeResult()

    @BaseAnalyzer.check_data((ProfilingDataset.get_key(),))
    def optimize(self, **kwargs) -> OptimizeResult:
        """
        Run the configured checker over the profiling dataset, collecting the
        record, detail rows and tune-op list into self.result.
        :return: optimize result
        """
        profiling_data = self.get_first_data_by_key(self.dataset_list, ProfilingDataset.get_key())
        checker = self.checker
        if not checker.pre_check(profiling_data):
            return self.result
        if checker.check(profiling_data):
            record = checker.make_record(profiling_data)
            checker.make_render(self.html_render, record)
            self.result.add(record)
            self._add_details(checker)
            tune_op_list = checker.get_tune_op_list()
            if tune_op_list:
                self.result.add_tune_op_list(tune_op_list)
        return self.result

    def _add_details(self, checker) -> None:
        """Copy the checker's detail rows into the result; the first row is the header."""
        details = checker.get_details()
        if not details:
            return
        for index, detail in enumerate(details):
            if index == 0:
                self.result.add_detail(checker.get_name(), headers=detail)
            else:
                self.result.add_detail(checker.get_name(), detail=detail)

    def make_record(self):
        pass

    def make_render(self):
        pass
+
+
class DynamicShapeAnalyzer(ProfilingAnalyzer):
    """Profiling analyzer specialized with the dynamic-shape checker."""

    def __init__(self, collection_path, **kwargs) -> None:
        super().__init__(collection_path, **kwargs)
        self.checker = DynamicShapeChecker(self.cann_version)
+
+
class BlockDimAnalyzer(ProfilingAnalyzer):
    """Profiling analyzer specialized with the block-dim checker."""

    def __init__(self, collection_path, **kwargs) -> None:
        super().__init__(collection_path, **kwargs)
        self.checker = BlockDimChecker(self.cann_version)
+
+
class OperatorBoundAnalyzer(ProfilingAnalyzer):
    """Profiling analyzer specialized with the operator-bound checker."""

    def __init__(self, collection_path, **kwargs) -> None:
        super().__init__(collection_path, **kwargs)
        self.checker = OperatorBoundChecker(self.cann_version)
+
class AicpuAnalyzer(ProfilingAnalyzer):
    """Profiling analyzer specialized with the AI-CPU checker."""

    def __init__(self, collection_path, **kwargs) -> None:
        super().__init__(collection_path, **kwargs)
        self.checker = AicpuChecker(self.cann_version)
\ No newline at end of file
diff --git a/profiler/advisor_review/analyzer/dataloader/__init__.py b/profiler/advisor_review/analyzer/dataloader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/graph_fusion/__init__.py b/profiler/advisor_review/analyzer/graph_fusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/graph_fusion/graph_fusion_analyzer.py b/profiler/advisor_review/analyzer/graph_fusion/graph_fusion_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..326be83b8d49088b1563ccd8c08b68a4aa3001ef
--- /dev/null
+++ b/profiler/advisor_review/analyzer/graph_fusion/graph_fusion_analyzer.py
@@ -0,0 +1,49 @@
+from typing import List
+from functools import partial
+
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.dataset.graph_dataset import GraphDataset
+from profiler.advisor.analyzer.graph_fusion.graph_fusion_checker import GraphFusionRules
+from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
+from profiler.advisor.display.html.render import HTMLRender
+
+
class FusionOPAnalyzer(BaseAnalyzer):
    """
    Fusion optimizer: matches known fusion patterns against the computation
    graph and, when profiling data is available, attaches operator timings.
    """
    RULES = dict(graph_dataset=partial(GraphFusionRules, "rules/op_fusion_pass.yaml"))
    dataset_cls_list = [GraphDataset, ProfilingDataset]

    def __init__(self, collection_path, **kwargs) -> None:
        super(FusionOPAnalyzer, self).__init__(collection_path, **kwargs)
        self.result = OptimizeResult()
        self.html_render = HTMLRender()

    @BaseAnalyzer.check_data((GraphDataset.get_key(),))
    def optimize(self, **kwargs):
        """
        Run all fusion rules and return the collected result.
        :return: result
        """
        self._check(self.dataset_list.get("GraphDataset"), self.dataset_list.get("ProfilingDataset"))
        return self.result

    def _check(self, graph_data: List[GraphDataset],
               profiling_data: List[ProfilingDataset] = None) -> None:
        # fix: `not graph_data` also covers None (dataset key absent), which the
        # original `len(graph_data) == 0` would crash on with a TypeError
        if not graph_data or graph_data[0].is_empty():
            return
        for rule in self.RULES.values():
            checker = rule()
            if profiling_data is None:
                checker.find_fusion_matched_issues(graph_data)
            else:
                checker.find_fusion_matched_issues_with_times(graph_data, profiling_data)
            checker.make_record(self.result)
            checker.make_render(self.html_render)

    def make_record(self):
        pass

    def make_render(self):
        pass
diff --git a/profiler/advisor_review/analyzer/graph_fusion/graph_fusion_checker.py b/profiler/advisor_review/analyzer/graph_fusion/graph_fusion_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..e64020fdfe2ace37172e82ed562db1b66971d3d6
--- /dev/null
+++ b/profiler/advisor_review/analyzer/graph_fusion/graph_fusion_checker.py
@@ -0,0 +1,207 @@
+import logging
+from typing import List
+
+from tqdm import tqdm
+
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord, StatisticsItem
+from profiler.advisor.common.graph.graph import Graph
+from profiler.advisor.common.graph.graph_parser import QueryGraphParser
+from profiler.advisor.dataset.graph_dataset import GraphDataset
+from profiler.advisor.common.graph.graph_match import find_isomorphisms
+
+logger = logging.getLogger()
+
+
class GraphFusionRules:
    """Matches fusion-rule query graphs against a computation graph and records the results."""

    def __init__(self, fusion_rules: str):
        # fusion_rules: path to the yaml file describing the fusion patterns
        self.fusion_rules = fusion_rules
        # one entry per matched rule: the list of isomorphism candidates found
        self.candidates = []
        # durations aligned with candidates: task_duration_list[rule][candidate][node]
        self.task_duration_list = []

    @staticmethod
    def build_query_graph(query_graphs) -> List[Graph]:
        """Build and yield a Graph for every sub-graph of every parsed fusion rule."""
        for _, query_graph in query_graphs.fusion_rules.items():
            for sub_graph in query_graph:
                graph = Graph(*sub_graph)
                graph.build()
                yield graph

    def find_fusion_matched_issues(self, graphs: List[GraphDataset]):
        """Collect isomorphic matches of each rule inside the last graph of the first dataset."""
        query_graphs = QueryGraphParser(self.fusion_rules)
        with tqdm(total=query_graphs.num_rules, leave=False, ncols=100, unit=" rules") as pbar:
            pbar.set_description(f"Searching Isomorphic Subgraph")
            for query_graph in self.build_query_graph(query_graphs):
                # match the rule's query graph against the target computation graph
                query_candidates = find_isomorphisms(query_graph.graph, graphs[0].graphs[-1].graph)
                pbar.update(1)
                if len(query_candidates) > 0:
                    self.candidates.append(query_candidates)

    def find_fusion_matched_issues_with_times(self, graphs: List[GraphDataset], profiling):
        """Match fusion rules, attach task durations, then sort matches by total time desc."""
        self.find_fusion_matched_issues(graphs)
        if len(self.candidates) == 0 or len(profiling) == 0:
            return

        if not hasattr(profiling[0], 'op_summary') or profiling[0].op_summary is None:
            # fall back to msprof timings when no op_summary is available;
            # without either source the candidates stay unordered
            if hasattr(profiling[0], 'msprof'):
                self.match_time_from_msprof(profiling[0].msprof)
                return
            else:
                logger.warning("Skip analyze operator because of not containing op summary.")
                return

        self.match_time_from_summary(profiling[0].op_summary)
        time_duration_sum = []
        for task_duration in self.task_duration_list:
            time_duration_sum.append(sum([sum(duration) for duration in task_duration]))
        # indices of rules ordered by their summed duration, descending
        time_duration_index = sorted(range(len(time_duration_sum)),
                                     key=time_duration_sum.__getitem__,
                                     reverse=True)
        # reorder durations and candidates in lockstep
        self.task_duration_list = [self.task_duration_list[i] for i in time_duration_index]
        self.candidates = [self.candidates[i] for i in time_duration_index]

    def match_time_from_summary(self, op_summary):
        """Fill task_duration_list from op_summary; missing/mismatched operators count as 0."""
        op_dict = op_summary.task_dict
        for candidates in self.candidates:
            candidate_duration = []
            for candidate in candidates:
                duration_list = []
                for node in candidate.values():
                    # require both presence and a matching (case-insensitive) op type
                    if node.op_name not in op_dict or op_dict[node.op_name][0].op_type.lower() != node.op_type.lower():
                        logger.warning("Operator %s is missing in op summary, which will be set to 0.", node.op_name)
                        duration_list.append(0.0)
                        continue
                    duration_list.append(float(op_dict[node.op_name][0].task_duration))
                candidate_duration.append(duration_list)
            self.task_duration_list.append(candidate_duration)

    def match_time_from_msprof(self, msprof):
        """Fill task_duration_list from msprof tasks keyed by item_id; missing ops count as 0."""
        op_dict = dict()
        for task in msprof.tasks:
            if "item_id" not in task.args:
                continue
            op_dict[task.args["item_id"]] = {"task_duration": task.dur}
        for candidates in self.candidates:
            candidate_duration = []
            for candidate in candidates:
                duration_list = []
                for node in candidate.values():
                    if node.op_name not in op_dict:
                        logger.warning("Operator %s is missing in msprof, which will be set to 0.", node.op_name)
                        duration_list.append(0.0)
                        continue
                    duration_list.append(float(op_dict[node.op_name].get("task_duration")))
                candidate_duration.append(duration_list)
            self.task_duration_list.append(candidate_duration)

    def make_render(self, html_render):
        """Render matched fusion candidates (with optional timings) into the fusion.html template."""
        if not self.candidates:
            return

        candidates_list = []
        for case_id, nodes in enumerate(self.candidates):
            candidate_dict = dict()
            candidate_dict['counts'] = len(nodes)
            candidate_dict['matches'] = []
            has_time_info = False
            if self.task_duration_list:
                has_time_info = True
                candidate_dict['total_duration'] = round(sum(sum(duration) for duration in
                                                             self.task_duration_list[case_id]), 2)
            for node_index, refer_node in enumerate(nodes):
                match = []
                index = 0
                pass_name = ','.join(item.op_type for item in refer_node.keys())
                for query_node, host_node in refer_node.items():
                    fusion_pattern = query_node.op_pass

                    # record pass / pattern once per candidate group
                    if 'op_pass' not in candidate_dict:
                        candidate_dict['op_pass'] = fusion_pattern
                    if 'fusion_pattern' not in candidate_dict:
                        candidate_dict['fusion_pattern'] = pass_name
                    match_attr = dict()
                    match_attr['op_name'] = host_node.op_name
                    match_attr['dtype'] = query_node.op_type
                    if has_time_info:
                        match_attr['duration'] = round(self.task_duration_list[case_id][node_index][index], 2)
                    index += 1
                    match.append(match_attr)
                # trailing "-" row holds the candidate's total duration
                match_attr = dict()
                match_attr['op_name'] = "-"
                match_attr['dtype'] = "-"
                if has_time_info:
                    match_attr['duration'] = round(sum(self.task_duration_list[case_id][node_index]), 2)
                match.append(match_attr)
                candidate_dict['matches'].append(match)
            candidates_list.append(candidate_dict)
        html_render.render_template(key="computation",
                                    template_dir="templates",
                                    template_name="fusion.html",
                                    candidates=candidates_list)

    def make_record(self, result: OptimizeResult):
        """
        make record for what and how to optimize
        """
        if not self.candidates:
            return

        optimization_item = OptimizeItem(
            "fusion issue",
            f"Found {len(self.candidates)} fusion issues",
            ["Check fusion issues detail in att_advisor*.html"]
        )
        # total time over every rule / candidate / node
        total_time = 0.0
        for candidate in self.task_duration_list:
            for duration in candidate:
                total_time += sum(duration)
        statistics_item = StatisticsItem(0,
                                         total_time,
                                         sum([len(candidate) for candidate in self.candidates])
                                         )
        result.add(OptimizeRecord(optimization_item, statistics_item))

        record_title = [
            "issue_id", "graph_name", "op_name", "fusion_structure", "fusion_pattern",
            "op_type", "input_shape", "input_format",
            "input_dtype", "output_shape", "output_format", "output_dtype"
        ]
        result.add_detail('fusion issues', headers=record_title)

        # one detail row per matched host node, keyed by the rule's case_id
        for case_id, nodes in enumerate(self.candidates):
            for _, refer_node in enumerate(nodes):
                pass_name = ','.join(item.op_type for item in refer_node.keys())
                for query_node, host_node in refer_node.items():
                    fusion_pattern = query_node.op_pass
                    detail = [
                        case_id,
                        host_node.graph_name,
                        host_node.op_name,
                        pass_name,
                        fusion_pattern,
                        query_node.op_type,
                        self.get_attr_shape(host_node, "input", "shape"),
                        self.get_attr_type(host_node, "input", "format"),
                        self.get_attr_type(host_node, "input", "dtype"),
                        self.get_attr_shape(host_node, "output", "shape"),
                        self.get_attr_type(host_node, "output", "format"),
                        self.get_attr_type(host_node, "output", "dtype"),
                    ]
                    result.add_detail('fusion issues', detail=detail)

    @staticmethod
    def get_attr_shape(node, type_name: str, attr_name: str) -> str:
        """Join each input/output entry's shape list with ',' and the entries with ';'."""
        attr_shape = []
        node_attrs = getattr(node, type_name, [])
        for attrs in node_attrs:
            attr = getattr(attrs, attr_name, [])
            attr_shape.append(",".join(attr))
        return ";".join(attr_shape)

    @staticmethod
    def get_attr_type(node, type_name: str, attr_name: str) -> str:
        """Join the given attribute of each input/output entry with ';'."""
        attr_type = []
        node_attrs = getattr(node, type_name, [])
        for attr in node_attrs:
            attr_type.append(getattr(attr, attr_name, ""))
        return ";".join(attr_type)
diff --git a/profiler/advisor_review/analyzer/overall/__init__.py b/profiler/advisor_review/analyzer/overall/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/overall/overall_analyzer.py b/profiler/advisor_review/analyzer/overall/overall_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..916a396b3d096dc788954cbc8e8ba9755cd15f4e
--- /dev/null
+++ b/profiler/advisor_review/analyzer/overall/overall_analyzer.py
@@ -0,0 +1,45 @@
+import logging
+from typing import Dict, List
+
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.display.html.render import HTMLRender
+from profiler.advisor.result.result import OptimizeResult
+from profiler.compare_tools.compare_backend.utils.constant import Constant
+from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface
+
+logger = logging.getLogger()
+
+
class OverallSummaryAnalyzer(BaseAnalyzer):
    """Compares the overall profiling time distribution against a benchmark run."""

    def __init__(self, profiling_path, benchmark_profiling_path=None, **kwargs):
        # NOTE(review): BaseAnalyzer.__init__ is not called here — confirm that is intentional.
        # Without a benchmark path the run is compared against itself.
        self.benchmark_profiling_path = benchmark_profiling_path or profiling_path
        self.profiling_path = profiling_path
        self.html_render = HTMLRender()
        self.result = OptimizeResult()

    def optimize(self, **kwargs):
        """
        Run the overall comparison and render its time-distribution table.
        :return: raw comparison result dict
        """
        compare_result = ComparisonInterface(self.benchmark_profiling_path, self.profiling_path).compare(
            Constant.OVERALL_COMPARE)

        headers = compare_result.get('Model Profiling Time Distribution').get("headers", [])
        rows = compare_result.get('Model Profiling Time Distribution').get("rows", [])

        self.make_record()
        self.make_render(headers=headers, rows=rows)
        return compare_result

    def make_record(self):
        pass

    def make_render(self, **kwargs):
        """Render the overall-analysis table; skips rendering when there is no data."""
        headers = kwargs.get("headers")
        rows = kwargs.get("rows")

        if not headers or not rows:
            logger.info("Empty headers or rows, skip render overall analysis html")
            # fix: previously fell through and rendered an empty table anyway
            return
        self.html_render.render_template(key="overall",
                                         template_dir="templates",
                                         template_name="overall_analysis.html",
                                         headers=headers,
                                         rows=rows)
diff --git a/profiler/advisor_review/analyzer/overall/overall_summary_analyzer.py b/profiler/advisor_review/analyzer/overall/overall_summary_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c74ae0510331fb9ba8a1794bd724710ba19cfabf
--- /dev/null
+++ b/profiler/advisor_review/analyzer/overall/overall_summary_analyzer.py
@@ -0,0 +1,262 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import copy
+
+import logging
+from typing import Dict, List
+
+from profiler.advisor.display.html.render import HTMLRender
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.compare_tools.compare_backend.utils.constant import Constant
+from profiler.advisor.common import constant as const
+from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface
+from profiler.advisor.utils.utils import get_file_path_from_directory, load_parameter
+
+
class OverallSummaryAnalyzer(BaseAnalyzer):
    """Summarizes the overall time distribution of a profiling run and flags bottlenecks."""

    OVERALL_SUMMARY_ANALYZER = "overall_summary_analysis"
    advice_map = {
        "Computing Time": "if you want more detailed advice please go to att_advisor_*.html",
        "Uncovered Communication Time": "if you want more detailed advice please go to att_advisor_*.html",
        "Free Time": "if you want more detailed advice please go to att_advisor_*.html"
    }
    # maps raw comparison column names to display names
    time_name_map = {
        "Computing Time": "computing",
        "Uncovered Communication Time": "communication",
        "Free Time": "free",
        'Cube Time(Num)': 'Cube Time',
        'Vector Time(Num)': 'Vector Time',
        'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)',
        'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)',
        'Other Time': "Other Computing Time",
        'SDMA Time(Num)': 'SDMA Time'
    }
    # top-level time categories and the sub-columns that break each one down
    performance_time_dict = {
        "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)',
                           'Flash Attention Time(Backward)(Num)', 'Other Time'],
        "Uncovered Communication Time(Wait Time)": [],
        "Free Time": ['SDMA Time(Num)']
    }

    def __init__(self, collection_path: str, n_processes: int = 1, **kwargs):
        profile_path = get_profile_path(collection_path)
        super().__init__(profile_path, n_processes, **kwargs)
        self.base_collection_path = kwargs.get("base_collection_path", "")
        self._has_base_collection = False
        self._is_minimal_profiling = False
        self.cur_data = {}
        self.cur_data_table = {}
        self.cur_bottleneck = {}
        self.cur_advices = ""
        self._headers = []
        self._base_data = []
        self._comparison_data = []
        self.html_render = HTMLRender()
        self.result = OptimizeResult()
        self.bottleneck_str = ""
        self.bottleneck_table = {}

    @staticmethod
    def split_duration_and_num(time_value: str) -> tuple:
        """Split a '0.229s(1756)'-style value into (duration_seconds, kernel_num)."""
        split_data = time_value.split("s")  # time value example: 0.229s(1756)
        duration, num = 0.0, None
        if len(split_data) >= 2:
            try:
                num = int(split_data[1].strip("()"))
            except ValueError:
                pass
        if len(split_data) >= 1:
            try:
                duration = float(split_data[0])
            except ValueError:
                print(f"[WARNING] Invalid time value: {time_value}.")
        return duration, num

    @staticmethod
    def calculate_ratio(dividend, divisor):
        """Return dividend / divisor, or inf when the divisor is falsy (e.g. 0)."""
        if not divisor:
            return float("inf")
        return dividend / divisor

    def path_check(self):
        """Validate the collection paths; remember whether a usable benchmark path exists."""
        if self.base_collection_path:
            if os.path.exists(self.base_collection_path):
                self._has_base_collection = True
            else:
                print(f"[WARNING] Invalid path which not exists: {self.base_collection_path}.")
        return os.path.exists(self.collection_path)

    def process(self):
        """Run the overall comparison and aggregate durations per time category."""
        base_collection_path = self.base_collection_path if self._has_base_collection else self.collection_path
        result_data = ComparisonInterface(base_collection_path, self.collection_path).compare(Constant.OVERALL_COMPARE)
        for data in result_data.values():
            self._headers = data.get("headers", [])
            rows = data.get("rows", [])
            if len(rows) == 2:
                # row 0 is the benchmark run, row 1 the run under analysis
                self._base_data = rows[0]
                self._comparison_data = rows[1]
        if not self._headers or not self._comparison_data:
            return
        self._is_minimal_profiling = 'E2E Time(Not minimal profiling)' not in self._headers
        if self._has_base_collection:
            self.cur_data["comparison_result"] = result_data
        time_category_dict = {}
        for time_category, time_list in self.performance_time_dict.items():
            time_value = self.get_time_value(time_category, self._comparison_data)
            if time_value == Constant.INVALID_VALUE:
                continue
            duration, _ = self.split_duration_and_num(time_value)
            # strip the "(Wait Time)"-style suffix from the category name
            time_category = time_category.split("(")[0]
            time_category_dict[time_category] = duration
            self.get_sub_category_time(time_category, time_list, duration)
        self.cur_data["overall_data"] = time_category_dict

    def get_time_value(self, header_name: str, data_list: list):
        """Return the cell of data_list under header_name, or INVALID_VALUE when absent."""
        try:
            data_index = self._headers.index(header_name)
        except ValueError:
            return Constant.INVALID_VALUE
        try:
            time_value = data_list[data_index]
        except IndexError:
            return Constant.INVALID_VALUE
        return time_value

    def get_sub_category_time(self, category: str, time_list: list, total_duration: float):
        """Collect per-subtype duration, ratio and kernel count for one time category."""
        sub_time_dict = {}
        for time_name in time_list:
            time_value = self.get_time_value(time_name, self._comparison_data)
            if time_value == Constant.INVALID_VALUE:
                continue
            sub_time_dict.setdefault(f"{category} Subtype", []).append(self.time_name_map.get(time_name, ""))
            duration, num = self.split_duration_and_num(time_value)
            sub_time_dict.setdefault(f"Duration(s)", []).append(duration)
            sub_time_dict.setdefault(f"Duration Ratio", []).append(
                "{:.2%}".format(self.calculate_ratio(duration, total_duration)))
            sub_time_dict.setdefault(f"Kernel Number", []).append(num)
        self.cur_data[self.time_name_map.get(category)] = sub_time_dict

    def identify_bottleneck(self):
        """Derive human-readable bottleneck descriptions from the aggregated overall data."""
        overall_data = self.cur_data.get("overall_data")
        if not overall_data:
            return
        # fix: keep e2e time as a float — it is used below in calculate_ratio.
        # Previously a '%.3f'-formatted *string* was passed as divisor -> TypeError.
        e2e_time = sum([data for data in overall_data.values()])
        overall_bottleneck = f"The Model E2E Time is {e2e_time:.3f}s.\n"
        comparison_bottleneck = ""
        for time_type, time_value in overall_data.items():
            # add subtype time bottleneck
            self.cur_bottleneck[self.time_name_map.get(time_type)] = f"{time_type} is {time_value}s.\n"
            # add overall bottleneck
            overall_bottleneck += f" -- {time_type} is {time_value}s\n"
            if time_type == "Free Time" and self._is_minimal_profiling and self.calculate_ratio(time_value,
                                                                                                e2e_time) > 0.1:
                overall_bottleneck += "percentage of free time exceed the threshold 10%."
            if not self._has_base_collection:
                continue
            # add comparison bottleneck
            time_type_origin = "Uncovered Communication Time(Wait Time)" \
                if time_type == "Uncovered Communication Time" else time_type
            base_duration, _ = self.split_duration_and_num(self.get_time_value(time_type_origin, self._base_data))
            if time_value > base_duration:
                ratio = "{:.2%}".format(self.calculate_ratio(time_value - base_duration, base_duration))
                comparison_bottleneck += f"{time_type} exceeds the benchmark by {ratio}\n"
        self.cur_bottleneck["overall_data"] = overall_bottleneck
        if comparison_bottleneck:
            self.cur_bottleneck["comparison_result"] = comparison_bottleneck

    def optimize(self, **kwargs):
        """Full pipeline: compare, detect bottlenecks, format, record and render."""
        if self.path_check():
            self.process()
        self.identify_bottleneck()
        self.format_bottleneck()
        self.format_cur_data()
        self.make_record()
        self.make_render()
        return self.result

    def format_bottleneck(self):
        """Flatten cur_bottleneck into a display string and a one-row table."""
        result = ''
        headers = []
        data_list = []
        data = []
        for key, value in self.cur_bottleneck.items():
            if not value:
                continue
            result += f'{key}: {value} \n'
            headers.append(key)
            data.append(value)
        data_list.append(data)
        self.bottleneck_str = result
        self.bottleneck_table["headers"] = headers
        self.bottleneck_table["data"] = data_list

    def format_cur_data(self):
        """Turn each cur_data entry into a one-row table for the detail report."""
        if not self.cur_data:
            return
        for data_type, data in self.cur_data.items():
            if not data:
                continue
            if data_type not in list(self.time_name_map.values()):
                data_list = list(data.values())
            else:
                # sub-category tables hold value lists; join them for display
                data_list = [','.join(map(str, value)) for value in data.values()]
            headers = list(data.keys())
            data_table = {"headers": headers, "data": [data_list]}
            self.cur_data_table[data_type] = copy.deepcopy(data_table)

    def make_record(self):
        """
        make record for what and how to optimize
        """
        if not self.bottleneck_str and not self.cur_advices:
            return
        optimization_item = OptimizeItem(
            OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER,
            self.bottleneck_str,
            self.cur_advices
        )
        self.result.add(OptimizeRecord(optimization_item))

        self.result.add_detail(const.BOTTLENECK, self.bottleneck_table["headers"], self.bottleneck_table["data"][0])
        for data_type, data_dict in self.cur_data_table.items():
            if data_dict:
                self.result.add_detail(const.DATA + data_type, data_dict["headers"], data_dict["data"][0])

    def make_render(self):
        """Render the bottleneck summary into the overall html template."""
        if not self.bottleneck_str and not self.cur_advices:
            return
        result_for_html = {
            "Description": self.bottleneck_str,
            "suggestion": self.cur_advices,
            "details": [self.bottleneck_table]
        }

        self.html_render.render_template(key="overall",
                                         title=OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER,
                                         template_dir="templates",
                                         template_name="cluster_analysis.html",
                                         cann_version=self.cann_version,
                                         torch_version=self.torch_version,
                                         result=result_for_html)
+
def get_profile_path(collection_path):
    """
    Walk collection_path and return the first directory containing a
    file whose name starts with "profiler_info"; return "" when none exists.
    """
    for root, _, files in os.walk(collection_path):
        for file_name in files:
            if file_name.startswith("profiler_info"):
                return root
    return ""
\ No newline at end of file
diff --git a/profiler/advisor_review/analyzer/schedule/__init__.py b/profiler/advisor_review/analyzer/schedule/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/schedule/dispatch/__init__.py b/profiler/advisor_review/analyzer/schedule/dispatch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py b/profiler/advisor_review/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e62a3ff0c8eebc0cf7b5b89953b8a0842df9c9d
--- /dev/null
+++ b/profiler/advisor_review/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py
@@ -0,0 +1,107 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+
+
+from profiler.advisor.common import constant as const
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.display.html.render import HTMLRender
+
+logger = logging.getLogger()
+
+
+class OpDispatchAnalyzer(BaseAnalyzer):
+    """
+    operator dispatch optimizer
+
+    Scans a timeline dataset for operator (jit) compilation events and, when
+    their count reaches the configured threshold, records an optimization
+    suggestion (disable jit compile for dynamic shapes) plus an HTML section.
+    """
+    # Dataset types required by BaseAnalyzer to drive this analyzer.
+    dataset_cls_list = [TimelineEventDataset]
+
+    def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None:
+        super().__init__(collection_path, n_processes, **kwargs)
+        key = TimelineEventDataset.get_key()
+        # First timeline dataset built for this collection path.
+        self.dataset = self.get_first_data_by_key(self.dataset_list, key)
+        self.result = OptimizeResult()
+        self.html_render = HTMLRender()
+        # Aggregated op-compile statistics extracted from the dataset (or None).
+        self._op_compile = None
+        # Rows for the result table: [issue, op name, counts, total time].
+        self._issues_record = []
+        self.optimization_item = []
+
+    def optimize(self, **kwargs):
+        """
+        optimize operator
+        :param data: input datasets
+        :return: result
+        """
+        self.get_op_compile_info(self.dataset)
+        self.make_record(self.result)
+        self.make_render(self.html_render)
+        return self.result
+
+    def get_op_compile_info(self, event_dataset: TimelineEventDataset):
+        """
+        Collect operator-compile statistics from the timeline dataset.
+        :Param event_dataset: dataset of timeline event
+        """
+        if hasattr(event_dataset, "ops_compile"):
+            self._op_compile = getattr(event_dataset, "ops_compile")
+            # Only report when the compile count reaches the threshold;
+            # occasional compiles below MAX_OP_COMPILE_NUM are not an issue.
+            if not self._op_compile or self._op_compile.total_count < const.MAX_OP_COMPILE_NUM:
+                return
+
+            self._issues_record.append(['operator dispatch',
+                                        const.OP_COMPILE_ID,
+                                        self._op_compile.total_count,
+                                        self._op_compile.total_time])
+        else:
+            logger.debug("Skip operator compile checker, because no op_compile attr find.")
+
+    def make_record(self, result: OptimizeResult):
+        """
+        make record for what and how to optimize
+        """
+        # Nothing to record when no compile issue crossed the threshold.
+        if not self._op_compile or len(self._issues_record) <= 0:
+            return
+        desc = f"Found {self._op_compile.total_count} operator compile issues."
+        suggestion = (f"Please use `torch_npu.npu.set_compile_mode(jit_compile=False)` to disable jit compile "
+                      f"in dynamic shape usage.")
+        self.optimization_item.append(OptimizeItem("Operator dispatch", desc, [suggestion]))
+        for optimization in self.optimization_item:
+            result.add(OptimizeRecord(optimization))
+        record_title = ["Issues", "op name", "counts", "total time"]
+        result.add_detail('operator dispatch', headers=record_title)
+        for op_info in self._issues_record:
+            result.add_detail('operator dispatch', detail=op_info)
+
+    def make_render(self, html_render):
+        """Render the recorded issues and suggestions into the HTML report."""
+        issues = []
+        optimizations = []
+        for optimization in self.optimization_item:
+            optimizations.append(dict(
+                description=optimization.description,
+                suggestion=optimization.suggestion[0]
+            ))
+        # Rows built by get_op_compile_info: [issue, op name, counts, total time].
+        for record in self._issues_record:
+            issues.append(dict(issue=record[0],
+                               op_name=record[1],
+                               counts=record[2],
+                               total_time=record[3]))
+        html_render.render_template(key="schedule",
+                                    template_dir="templates",
+                                    template_name="operator_dispatch.html",
+                                    issues=issues,
+                                    optimizers=optimizations)
diff --git a/profiler/advisor_review/analyzer/schedule/free_event/__init__.py b/profiler/advisor_review/analyzer/schedule/free_event/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/schedule/fusion_ops/__init__.py b/profiler/advisor_review/analyzer/schedule/fusion_ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor_review/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py b/profiler/advisor_review/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1eb24b8e1e11ac167a7eb9333867167a57dd524
--- /dev/null
+++ b/profiler/advisor_review/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py
@@ -0,0 +1,271 @@
+import multiprocessing
+import logging
+import re
+
+from tqdm import tqdm
+
+from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
+from profiler.advisor.common import constant as const
+from profiler.advisor.common.analyzer_scopes import SupportedScopes
+from profiler.advisor.common.timeline.event import TimelineEvent
+from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.advisor.utils.utils import format_timeline_result
+from profiler.advisor.common.timeline.fusion_ops_db import init_timeline_ops_db
+
+logger = logging.getLogger()
+
+
+class TimelineFusionOpsAnalyzer(BaseAnalyzer):
+    """Analyzer that scans a timeline for operator sequences which could be
+    replaced by affinity (fused) torch_npu APIs, and reports the matches
+    together with their Python call stacks when stacks were collected.
+    """
+    # Dataset types required by BaseAnalyzer to drive this analyzer.
+    dataset_cls_list = [TimelineEventDataset]
+
+    def __init__(self, collection_path, n_processes: int = 1, **kwargs):
+        super().__init__(collection_path, n_processes, **kwargs)
+        # Map "npu_api:op_rule" -> indices of matched events; shared dict when
+        # matching runs in multiple processes.
+        self._matched_op_index = {} if self.n_processes <= 1 else multiprocessing.Manager().dict()
+        # Map "npu_api:op_rule" -> {stack string: hit count}.
+        self.matched_op_stacks = {}
+        # Becomes False as soon as any matched event carries a call stack.
+        self.empty_stacks = True
+        key = TimelineEventDataset.get_key()
+        self.timeline_event_dataset = self.get_first_data_by_key(self.dataset_list, key)
+
+    def optimize(self, **kwargs):
+        """Run the fusion-ops scan for every supported mode and build the report."""
+        for mode in [const.ATEN.lower(), const.OPTIMIZER.lower()]:
+
+            # The ops db exposes per-mode rule maps named "_<mode>_op_api_map",
+            # keyed by an op-combination rule with the candidate npu APIs as value.
+            for op_combined, npu_apis in tqdm(getattr(init_timeline_ops_db(self.cann_version, self.torch_version),
+                                                      f"_{mode}_op_api_map").items(), leave=False, ncols=100,
+                                              desc="Scanning timeline for affinity apis"):
+                for npu_api in npu_apis.split("/"):
+                    self.find_fusion_ops(self.timeline_event_dataset, op_combined, npu_api, mode)
+
+        self.query_stack(self.timeline_event_dataset)
+
+        logger.info("Finish timeline analysis")
+        self.make_record()
+        self.make_render()
+        return self.result
+
+    def find_fusion_ops(self, event_dataset, ops: str, npu_api: str, mode: str):
+        """
+        :Param event_dataset: dataset of timeline event
+        :Param ops: operator combination with '-' as separator , e.g. permute-reshape
+        :Param npu_api: api of torch_npu, generally more efficient than torch api
+        :Param mode: aten or dequeue or optimizer
+        :Return: json of op_name and called times and detail stacks
+        """
+        # Rules containing "()" are fuzzy and need regex matching; plain rules
+        # use exact sequence comparison.
+        op_rule_pattern, enable_regex = self._format_rule_to_pattern(ops)
+        if not enable_regex:
+            self._match_ops(event_dataset, op_rule_pattern, npu_api, mode)
+        else:
+            try:
+                self._match_ops_with_regex(event_dataset, op_rule_pattern, npu_api, mode)
+            except Exception as e:
+                # A malformed rule must not abort the whole scan.
+                logger.warning("Failed to find fusion operators with regex %s, reason is %s", ops, e)
+
+    def _match_ops(self, event_dataset, ops: str, npu_api: str, mode: str):
+        """ match operator based on fusion operators rule(without regex),
+        only strictly equals of op name list means matched
+        :Param event_dataset: dataset of timeline event
+        :Param ops: operator combination with '-' as separator , e.g. permute-reshape
+        :Param npu_api: api of torch_npu, generally more efficient than torch api
+        :Param mode: aten or dequeue or optimizer
+        """
+        op_list = ops.split(const.OP_SEP)
+
+        matched_op_index = set()
+        api_ops_matched = False
+
+        for index, event in enumerate(getattr(event_dataset, mode)):
+            # Cheap first-op check before comparing the whole window.
+            if self._replace_op_name_prefix(event.name, mode) != op_list[0]:
+                continue
+            # Window of len(op_list) consecutive events must equal the rule exactly.
+            tmp_dequeue_event_names = [self._replace_op_name_prefix(event.name, mode) for event in
+                                       getattr(event_dataset, mode)[index: index + len(op_list)]]
+            if tmp_dequeue_event_names != op_list:
+                continue
+            api_ops_matched = True
+            matched_op_index.add(event.dataset_index)
+
+        if api_ops_matched:
+            self._matched_op_index[npu_api + f":{ops}"] = matched_op_index
+
+    def _match_ops_with_regex(self, event_dataset, op_rule_pattern: str, npu_api: str,
+                              mode: str):
+        """ match operator based on fusion operators rule(with regex),
+        using regex to support condition like 'a = torch.mul(xxx) if xxx else torch.add(xxx)'
+        :Param event_dataset: dataset of timeline event
+        :Param op_rule_pattern: fusion operators rule with regex definition , e.g. add-mul{0,10}, add-mul*
+        :Param npu_api: api of torch_npu, generally more efficient than torch api
+        :Param mode: aten or dequeue or optimizer
+        """
+        matched_op_index = set()
+        # Flatten all op names into one string like "-add--mul--div-" so the
+        # whole sequence can be matched with a single regex pass.
+        total_op_name = "".join([f"{const.OP_SEP}{self._replace_op_name_prefix(event.name, mode)}{const.OP_SEP}"
+                                 for event in
+                                 getattr(event_dataset, mode)])
+
+        matched_pattern_index_tuple = [(x.start(0), x.end(0)) for x in re.finditer(op_rule_pattern, total_op_name)]
+        # convert list of index tuple to a whole list: [(3, 25), ...] -> [3, 25, ...]
+        total_ops_split_points = [num for sublist in matched_pattern_index_tuple for num in sublist]
+
+        api_ops_matched = len(total_ops_split_points) != 0
+
+        op_index = []
+        # Ensure the split points cover the full string so every segment
+        # (matched or not) gets an entry below.
+        if 0 not in total_ops_split_points:
+            total_ops_split_points = [0] + total_ops_split_points
+        if len(list(total_op_name)) not in total_ops_split_points:
+            total_ops_split_points.append(len(list(total_op_name)))
+
+        # convert total ops name like "-add-mul-xxx-div-" to small pieces like [["add", "mul"], [...], ["div"]]
+        # by the regex index and then calculate the real index for matched fusion operators in event dataset
+        for l, r in zip(total_ops_split_points, total_ops_split_points[1:]):
+            matched_op_flag = True if (l, r) in matched_pattern_index_tuple else False
+            matched_ops_list = total_op_name[l: r].strip(const.OP_SEP).split(const.OP_SEP + const.OP_SEP)
+            op_index.append([matched_op_flag, len(matched_ops_list)])
+        for i, _ in enumerate(op_index):
+            if i > 0:
+                # calculate cumsum for indexing matched operator
+                op_index[i][1] = op_index[i][1] + op_index[i - 1][1]
+        # Sentinel so a match starting at segment 0 indexes from event 0.
+        op_index = [[False, 0]] + op_index
+
+        for i, _ in enumerate(op_index):
+            if not op_index[i][0]:
+                continue
+            # Cumulative count of ops before this matched segment == index of
+            # the first event of the match in the mode's event list.
+            index = op_index[i - 1][1]
+            matched_op_index.add(index)
+
+            if index > len(getattr(event_dataset, mode)) - 1:
+                continue
+            # NOTE(review): both the list index and the event's dataset_index
+            # are added to the matched set -- presumably intentional so either
+            # indexing scheme finds the stack later; confirm against
+            # _query_stack_by_matched_index.
+            dataset_index = getattr(event_dataset, mode)[index].get("dataset_index")
+            matched_op_index.add(dataset_index)
+
+        if api_ops_matched:
+            self._matched_op_index[npu_api + f":{op_rule_pattern}"] = sorted(list(matched_op_index))
+
+    def make_record(self):
+        """
+        make record for what and how to optimize
+        """
+        if not self.matched_op_stacks:
+            return
+
+        desc = f"Found {len(format_timeline_result(self.matched_op_stacks))} apis to be replaced" \
+               f" based on the runtime env cann-{self.cann_version} and torch-{self.torch_version}"
+        suggestion = "Please replace training api according to sub table 'Affinity training api'"
+        # Without stacks we can only point the user at how to re-collect with
+        # stack recording enabled.
+        if self.empty_stacks:
+            desc += ", but with no stack"
+            suggestion = const.TIMELINE_EMPTY_STACKS_PROMPT.format(
+                timeline_profiling_doc_url=const.TIMELINE_WITH_STACK_DOC_URL
+            )
+
+        optimization_item = OptimizeItem(
+            SupportedScopes.TIMELINE_FUSION_OPS,
+            desc,
+            [suggestion]
+        )
+
+        self.result.add(OptimizeRecord(optimization_item))
+
+        record_title = ["Affinity API", "Code stacks", "Stack called counts"]
+        self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, headers=record_title)
+
+        for api_name, stacks_info in format_timeline_result(self.matched_op_stacks).items():
+            if not stacks_info:
+                # Matched API without any collected stack: emit a placeholder row.
+                detail = [api_name, "null", "null"]
+                self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, detail=detail)
+            else:
+                for stack in stacks_info:
+                    detail = [api_name, *stack]
+                    self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, detail=detail)
+
+    def make_render(self):
+        """Render the affinity-API findings into the HTML report."""
+        format_result_for_html = format_timeline_result(dict(self.matched_op_stacks), dump_html=True)
+
+        self.html_render.render_template(key="schedule",
+                                         template_dir="templates",
+                                         template_name="affinity_api.html",
+                                         cann_version=self.cann_version,
+                                         torch_version=self.torch_version,
+                                         empty_stacks=self.empty_stacks,
+                                         with_stack_doc_url=const.TIMELINE_WITH_STACK_DOC_URL,
+                                         api_doc_url=const.TIMELINE_API_DOC_URL,
+                                         result=format_result_for_html)
+
+    def query_stack(self, event_dataset):
+        """Resolve call stacks for all matched event indices and aggregate
+        per-rule stack hit counts into self.matched_op_stacks."""
+        if all([len(matched_index) == 0 for matched_index in self._matched_op_index.values()]):
+            return
+
+        op_stack_list = event_dataset.parse_data_with_generator(self._query_stack_by_matched_index)
+        for op_stack in op_stack_list:
+            for op_rule, stack in op_stack.items():
+                if op_rule not in self.matched_op_stacks:
+                    self.matched_op_stacks[op_rule] = {}
+                # The no-stack flag still registers the rule above but must not
+                # be counted as a real stack.
+                if stack == const.TIMELINE_FUSION_OPS_NO_STACK_FLAG:
+                    continue
+                if stack not in self.matched_op_stacks[op_rule]:
+                    self.matched_op_stacks[op_rule][stack] = 0
+                self.matched_op_stacks[op_rule][stack] += 1
+
+    def _query_stack_by_matched_index(self, index, event):
+        """Per-event callback for parse_data_with_generator: return a mapping
+        of matched rule -> stack string (or the no-stack flag) for this event."""
+        stack_record = {}
+        event = TimelineEvent(event)
+
+        matched_op_rules = []
+        for op_rule, matched_index in self._matched_op_index.items():
+            if index not in matched_index:
+                continue
+
+            matched_op_rules.append(op_rule)
+            stack = event.args.get(const.CALL_STACKS)
+
+            if not stack:
+                logger.debug("Got empty '%s' for event %s", const.CALL_STACKS, event)
+                continue
+
+            # First real stack seen flips the report into "with stacks" mode.
+            if self.empty_stacks and stack:
+                self.empty_stacks = False
+
+            stack_record[op_rule] = stack
+
+        # Rules matched on this event but with no stack at all get the flag so
+        # the rule still appears in the result.
+        if matched_op_rules and not stack_record:
+            for op_rule in matched_op_rules:
+                stack_record[op_rule] = const.TIMELINE_FUSION_OPS_NO_STACK_FLAG
+
+        return stack_record
+
+    def _replace_op_name_prefix(self, event_name, mode):
+        """Strip the mode-specific prefix (e.g. "aten::") from an event name."""
+        if mode == const.DEQUEUE.lower():
+            op_name_prefix = f"{const.DEQUEUE}{const.DEQUEUE_SEP}"
+        # NOTE(review): this compares against const.ATEN without .lower(),
+        # while optimize() passes const.ATEN.lower() as the mode. If const.ATEN
+        # is not already lowercase, aten events would fall through to the
+        # optimizer branch -- confirm the constant's casing.
+        elif mode == const.ATEN:
+            op_name_prefix = f"{const.ATEN}{const.ATEN_SEP}"
+        else:
+            op_name_prefix = f"{const.OPTIMIZER}.{const.OPTIMIZER_STEP}{const.OPTIMIZER_SEP}"
+
+        return event_name.replace(op_name_prefix, "")
+
+    def _format_rule_to_pattern(self, op_rule):
+        """
+        Args:
+            op_rule: like (mul){0,1}-(add|neg){0,2}-dropout-(softmax)*
+
+        Returns: op_pattern like (-mul-){0,1}(-add-|-neg-){0,2}(-dropout-)(-softmax-)*
+        """
+        enable_regex = False
+        if "(" not in op_rule and ")" not in op_rule:
+            # op_rule which requires fuzzy matching must consist of "()";
+            # plain rules are returned unchanged for exact matching.
+            return op_rule, enable_regex
+
+        enable_regex = True
+        op_pattern_list = op_rule.split(const.OP_SEP)
+        format_op_pattern = ""
+        for op_pattern in op_pattern_list:
+            # Content inside "()" is the op-name alternation; anything after
+            # ")" (e.g. "{0,2}", "*") is kept as the quantifier suffix.
+            matched_res = re.search(r'\((.*?)\)', op_pattern)
+
+            ops_index_range = (matched_res.start() + 1, matched_res.end() - 1) if matched_res else (
+                0, len(op_pattern))
+
+            op_names = op_pattern[ops_index_range[0]: ops_index_range[1]]
+            tmp_op_names_record = []
+            for op_name in op_names.split("|"):
+                # Wrap each alternative in OP_SEP so it matches whole op names
+                # in the flattened "-op--op-" string built by the matcher.
+                tmp_op_names_record.append(f"{const.OP_SEP}{op_name.strip(' ')}{const.OP_SEP}")
+            op_suffix = op_pattern[ops_index_range[1] + 1:]
+            op_names_format = f"({'|'.join(tmp_op_names_record)}){op_suffix}"
+
+            format_op_pattern += op_names_format
+        return format_op_pattern, enable_regex
diff --git a/profiler/advisor_review/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py b/profiler/advisor_review/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..f684a4892111f113f6c502a010c9e14ccd43768a
--- /dev/null
+++ b/profiler/advisor_review/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py
@@ -0,0 +1,163 @@
+import logging
+from typing import List
+
+from profiler.advisor.common import constant as const
+from profiler.advisor.common.timeline.event import TimelineEvent
+from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
+from profiler.advisor.result.result import OptimizeResult
+from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
+from profiler.advisor.utils.utils import get_analyze_processes, ParallelJob
+
+logger = logging.getLogger()
+
+
+class OpStackFinder:
+    """Finds the Python call stacks of framework-side operator launches for
+    given device operator names / task types, by following the timeline's
+    torch_to_npu (or acl_to_npu) flow events back to the host-side op."""
+
+    def __init__(self):
+        self.n_processes = get_analyze_processes()
+        # Result rows: [task_id, op name, task type, stack string].
+        self._stack_record = []
+        # Map dst op index (or a no-stack sentinel code) -> list of
+        # [task_id, op_name, task_type] triples waiting for their stack.
+        self._task_id_record = {}
+        self.op_name = None
+        self.task_type = None
+        # Host-side event indices whose stacks must be resolved.
+        self.matched_index = set()
+
+    def get_api_stack_by_op(self, event_dataset: TimelineEventDataset, op_name: List[str] = None, task_type: str = None,
+                            disable_multiprocess=False):
+        """
+        :Param event_dataset: dataset of timeline event
+        :Param op_name: operator name, e.g. IndexPutV2
+        :Param task_type: operator task type, optionals are AI_CPU and AI_CORE
+        :Param disable_multiprocess: disable multiprocessing, avoid cost time of enable new process for light task
+        """
+        if not op_name:
+            op_name = []
+        if not isinstance(op_name, list):
+            op_name = [op_name]
+
+        self.op_name = ",".join(op_name)
+        self.task_type = task_type
+        # Empty op_name means "scan every operator in the dataset".
+        op_name_list = event_dataset.task_op_names if not op_name else op_name
+
+        if self.n_processes <= 1 or disable_multiprocess:
+            self._query_stacks_multiprocess(event_dataset, op_name_list, task_type)
+        else:
+            # Split the op-name list into roughly equal chunks, one per process.
+            event_num_per_process = int(len(op_name_list) / self.n_processes) + 1
+            parallel_analyzer = ParallelJob(
+                self._query_stacks_multiprocess,
+                [[event_dataset, op_name_list[i:i + event_num_per_process], task_type]
+                 for i in range(0, len(op_name_list), event_num_per_process)],
+                job_name="Analyzing operator stacks from timeline"
+            )
+            parallel_analyzer.start(self.n_processes)
+        self.query_stack(event_dataset)
+
+    def make_record(self, result: OptimizeResult):
+        """
+        make record for what and how to optimize
+        """
+        if not self._stack_record:
+            return
+
+        # Tailor the description to whichever filters were actually applied.
+        desc = f"Found {len(self._stack_record)} called stacks for"
+        if self.op_name and self.task_type:
+            desc += f" operators with name '{self.op_name}' with task type '{self.task_type}'"
+        elif self.op_name and not self.task_type:
+            desc += f" operators with name '{self.op_name}'"
+        elif self.task_type and not self.op_name:
+            desc += f" operators with task type '{self.task_type}'"
+        else:
+            desc += " all operators"
+
+        suggestion = f"Please use command 'ma-advisor analyze profiling' to analyze operators"
+        optimization_item = OptimizeItem(
+            "Operator stacks",
+            desc,
+            [suggestion]
+        )
+        result.add(OptimizeRecord(optimization_item))
+
+        record_title = ["Task ID", "op name", "op type", "code stacks"]
+        result.add_detail('operator stacks', headers=record_title)
+
+        for op_info in self._stack_record:
+            result.add_detail('operator stacks', detail=op_info)
+
+    def _get_api_stack_by_op(self, event_dataset: TimelineEventDataset, op_name: str, task_type: str):
+        """Record the host-side event index for every device op matching
+        (op_name, task_type), via its torch_to_npu / acl_to_npu flow event."""
+        for _, src_op_event in event_dataset.ops_with_task_type.items():
+
+            op_task_type = src_op_event.get(const.TASK_TYPE)
+            if not (src_op_event.name == op_name and op_task_type and op_task_type == task_type):
+                continue
+
+            # Flow-event keys changed across profiler versions; try the
+            # tid+ts form first, then the ts-only variants.
+            torch_to_npu_key = f"s-{src_op_event.tid}-{src_op_event.ts}"
+            torch_to_npu_event = event_dataset.torch_to_npu.get(torch_to_npu_key) or event_dataset.torch_to_npu.get(
+                f"s-{src_op_event.ts}") or event_dataset.torch_to_npu.get(f"s-{src_op_event.ts.replace('.', '')}")
+
+            acl_to_npu_event = src_op_event.ts in event_dataset.acl_to_npu
+
+            if not torch_to_npu_event and not acl_to_npu_event:
+                continue
+
+            # query stack by torch_to_npu first, due to each operator had acl_to_npu incoming flow in cann6.3
+            if torch_to_npu_event:
+                dst_op_index = self._query_index_by_torch_to_npu(event_dataset, torch_to_npu_event)
+            else:
+                dst_op_index = self._query_index_by_acl_to_npu(acl_to_npu_event)
+
+            if not dst_op_index:
+                continue
+
+            task_id = src_op_event.task_id
+            if not task_id:
+                continue
+            self.matched_index.add(dst_op_index)
+            if dst_op_index not in self._task_id_record:
+                self._task_id_record[dst_op_index] = []
+            self._task_id_record[dst_op_index].append([task_id, op_name, task_type])
+
+    def _query_index_by_torch_to_npu(self, event_dataset, torch_to_npu_event):
+        """Return the dataset index of the host op behind a torch_to_npu flow,
+        or the backward-no-stack sentinel when the host op is unknown."""
+        dst_op_event_key = torch_to_npu_event.ts
+        dst_op_event = event_dataset.ops_with_stack.get(dst_op_event_key)
+
+        if not dst_op_event:
+            return const.TIMELINE_BACKWARD_NO_STACK_CODE
+
+        return dst_op_event.get("dataset_index")
+
+    def _query_index_by_acl_to_npu(self, acl_to_npu_event):
+        """Return the acl-to-npu no-stack sentinel for a truthy flow marker;
+        implicitly returns None otherwise (caller skips falsy indices)."""
+        if acl_to_npu_event:
+            return const.TIMELINE_ACL_TO_NPU_NO_STACK_CODE
+
+    def _query_stacks_multiprocess(self, event_dataset, op_name_list, task_type):
+        """Worker: match each op name, for one task type or for both defaults."""
+        for op_name in op_name_list:
+            if task_type is not None:
+                self._get_api_stack_by_op(event_dataset, op_name, task_type)
+            else:
+                # No filter given: check both core types.
+                self._get_api_stack_by_op(event_dataset, op_name, const.AI_CORE)
+                self._get_api_stack_by_op(event_dataset, op_name, const.AI_CPU)
+
+    def _format_stack_record(self):
+        """Flatten the task-id record map into [task_id, *info] rows."""
+        stack_list = []
+        for task_id, stack_info in self._task_id_record.items():
+            stack_list.append([task_id, *stack_info])
+        return stack_list
+
+    def _query_stack_by_matched_index(self, index, event):
+        """Per-event callback for parse_data_with_generator: append the stack
+        (or a no-stack reason) for every matched op at this index."""
+        if index not in self.matched_index:
+            return None
+        event = TimelineEvent(event)
+        stack = event.args.get(const.CALL_STACKS)
+        stack = stack if stack else const.NO_STACK_REASON_MAP.get(const.TIMELINE_BACKWARD_NO_STACK_CODE)
+        for matched_op_info in self._task_id_record.get(index, []):
+            self._stack_record.append([*matched_op_info, stack])
+
+        # Ops reachable only via acl_to_npu never have a stack; emit the
+        # canned reason for each of them.
+        for matched_op_info in self._task_id_record.get(const.TIMELINE_ACL_TO_NPU_NO_STACK_CODE, []):
+            self._stack_record.append([*matched_op_info,
+                                       const.NO_STACK_REASON_MAP.get(const.TIMELINE_ACL_TO_NPU_NO_STACK_CODE)])
+        return None
+
+    def query_stack(self, event_dataset: TimelineEventDataset):
+        """Drive stack resolution over the whole dataset (no-op when empty)."""
+        if not event_dataset.dataset_len:
+            return
+        _ = event_dataset.parse_data_with_generator(self._query_stack_by_matched_index)
diff --git a/profiler/advisor_review/cluster_perf_analysis.ipynb b/profiler/advisor_review/cluster_perf_analysis.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..7ee0b24e85467fe42205c5986095a7e66bf0a636
--- /dev/null
+++ b/profiler/advisor_review/cluster_perf_analysis.ipynb
@@ -0,0 +1,1042 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "initial_id",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-21T13:31:25.022339600Z",
+ "start_time": "2023-11-21T13:31:25.016155200Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.append(\"../..\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "c552da9d-36f9-43d3-ae1f-c54f78d3ff2d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from profiler.advisor.interface.interface import Interface\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "from prettytable import PrettyTable, ALL\n",
+ "from textwrap import fill"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "57d17a21205c3c5e",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "source": [
+ "# 集群调优分析\n",
+ "## 1. 集群分析的数据准备\n",
+ "首先我们当前支持PyTorch多卡大模型的集群分析,您需要输入集群分析的profiling_path路径,例如: \n",
+ "--{profiling_path} \n",
+ " -- xxxx_ascend_pt \n",
+ " -- xxxx_ascend_pt \n",
+ " -- xxxx_ascend_pt \n",
+ " ...... \n",
+ " -- xxxx_ascend_pt \n",
+ "里面每张卡的profiling文件都是ascend_pt结尾的文件。 \n",
+ "\n",
+ "## 2. 集群分析解决的问题 \n",
+ "当前的功能主要有四项: \n",
+ "1). 识别多卡间的计算慢卡(根据计算时间等推断) \n",
+ "2). 识别多卡间的通信慢现象(根据通信链路的带宽判断) \n",
+ "3). 对多卡间的计算算子进行统计展示(识别不同卡的算子差异) \n",
+ "4). 展示集群流水并行图(根据时间轴展示多卡间的计算和通信时间) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "36b7a24cc7ca5da2",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-21T12:53:38.379699800Z",
+ "start_time": "2023-11-21T12:53:38.363755900Z"
+ },
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# EDIT THE PROFILING DATA PATH\n",
+ "cluster_path = r\"YOUR PROFILING PATH\"\n",
+ "interface = Interface(profiling_path=cluster_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cf832ac2e0dfa30f",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "source": [
+ "## 1) 识别慢卡"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "40aac93278dd6e34",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-11-21T12:53:41.815599700Z",
+ "start_time": "2023-11-21T12:53:41.783393700Z"
+ },
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[INFO]Cluster has been analyzed because of the existence of cluster analysis output directory.\n",
+ "[INFO]Skip Cluster analyze backend.\n"
+ ]
+ }
+ ],
+ "source": [
+ "slow_rank_result = interface.get_result(\"cluster\", \"slow_rank\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "0e943b2a-37a6-4db6-9e70-235d397f1d39",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
rank_id | \n", + "compute | \n", + "communication | \n", + "free | \n", + "
---|---|---|---|
0 | \n", + "28976239.07999987 | \n", + "7586795.419999811 | \n", + "6836641.679994211 | \n", + "
1 | \n", + "29012279.100000102 | \n", + "6984613.220000025 | \n", + "7388343.859991224 | \n", + "
2 | \n", + "29019115.32300051 | \n", + "7489956.633000028 | \n", + "6881360.253991371 | \n", + "
3 | \n", + "29027089.560000077 | \n", + "7963312.239999794 | \n", + "6389981.899993688 | \n", + "
4 | \n", + "29044786.93699965 | \n", + "6533618.639000017 | \n", + "7780517.1539908135 | \n", + "
5 | \n", + "29178186.259999853 | \n", + "7925184.420000028 | \n", + "6286867.999995028 | \n", + "
6 | \n", + "29025331.189999904 | \n", + "6386639.90799992 | \n", + "7941798.704992032 | \n", + "
7 | \n", + "29056803.304999545 | \n", + "7234444.826000024 | \n", + "7094608.035991492 | \n", + "
8 | \n", + "31383314.980000228 | \n", + "3973806.6169999996 | \n", + "8017981.379989724 | \n", + "
9 | \n", + "31360536.36200019 | \n", + "4757458.825000002 | \n", + "7277062.386991671 | \n", + "
10 | \n", + "31381891.800000463 | \n", + "5276870.359999998 | \n", + "6731073.659992552 | \n", + "
11 | \n", + "31387777.38000033 | \n", + "4727362.3000000045 | \n", + "7297578.339992355 | \n", + "
12 | \n", + "31374132.74499977 | \n", + "5164443.388000004 | \n", + "6829798.933991944 | \n", + "
13 | \n", + "31377800.178999804 | \n", + "4360616.283000001 | \n", + "7624691.509991412 | \n", + "
14 | \n", + "31374658.360000316 | \n", + "4457099.620000001 | \n", + "7542724.319990785 | \n", + "
15 | \n", + "31387255.527000006 | \n", + "5000860.905 | \n", + "6975264.115991174 | \n", + "
problem | \n", + "description | \n", + "
---|---|
slow_rank_analysis | \n", + "Computing has some issues in the cluster, because the max difference of Computing time has reached 2411.538ms. Communication has some issues in the cluster, because the max difference of Communication time has reached 3989.506ms. | \n",
+ "
rank_id | \n", + "RDMA bandwidth(GB/s) | \n", + "RDMA size(mb) | \n", + "RDMA time(ms) | \n", + "SDMA bandwidth(GB/s) | \n", + "SDMA size(mb) | \n", + "SDMA time(ms) | \n", + "
---|---|---|---|---|---|---|
0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "9.7668 | \n", + "42507.3469439998 | \n", + "4352.225880000002 | \n", + "
1 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "10.1653 | \n", + "42507.346775999795 | \n", + "4181.611080000001 | \n", + "
2 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "10.471 | \n", + "42507.346775999795 | \n", + "4059.527798999999 | \n", + "
3 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "9.9691 | \n", + "42507.346775999795 | \n", + "4263.9230400000015 | \n", + "
4 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "9.1469 | \n", + "42507.346775999795 | \n", + "4647.202435000001 | \n", + "
5 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "9.4663 | \n", + "42507.346775999795 | \n", + "4490.373999999999 | \n", + "
6 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "9.5692 | \n", + "42507.346775999795 | \n", + "4442.106745000001 | \n", + "
7 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "9.8444 | \n", + "42507.346775999795 | \n", + "4317.931616999999 | \n", + "
8 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.895 | \n", + "42507.389952 | \n", + "2249.662369 | \n", + "
9 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.9112 | \n", + "42507.39080800006 | \n", + "2247.7420159999997 | \n", + "
10 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.7713 | \n", + "42507.39080800006 | \n", + "2264.48576 | \n", + "
11 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.8389 | \n", + "42507.39080800006 | \n", + "2256.3606000000004 | \n", + "
12 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.7687 | \n", + "42507.39080800006 | \n", + "2264.8021099999996 | \n", + "
13 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.9717 | \n", + "42507.39080800006 | \n", + "2240.5713950000004 | \n", + "
14 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.9226 | \n", + "42507.39080800006 | \n", + "2246.381839999999 | \n", + "
15 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "18.8346 | \n", + "42507.39080800006 | \n", + "2256.8781 | \n", + "
problem | \n", + "description | \n", + "
---|---|
slow_rank_analysis | \n", + "Computing has some issues in the cluster, because the max difference of Computing time has reached 2411.538ms. Communication has some issues in the cluster, because the max difference of Communication time has reached 3989.506ms. | \n",
+ "
slow_link_analysis | \n", + "SDMA bandwidth(GB/s): The average is 14.332, while the maximum is 18.972GB/s and the minimum is 9.147GB/s. the difference is 9.825GB/s. | \n",
+ "
\n", + " | rank id | \n", + "Name | \n", + "Input Shapes | \n", + "Input Data Types | \n", + "Output Shapes | \n", + "Duration(us)_mean | \n", + "Duration(us)_var | \n", + "Duration(us)_max | \n", + "Duration(us)_min | \n", + "Duration(us)_count | \n", + "Duration(us)_sum | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0 | \n", + "Add100 | \n", + "\"4096,10880;4096,10880\" | \n", + "FLOAT;FLOAT | \n", + "\"4096,10880\" | \n", + "478.210918 | \n", + "237.729252 | \n", + "721.420 | \n", + "449.80 | \n", + "1024 | \n", + "489687.980 | \n", + "
1 | \n", + "0 | \n", + "Add102 | \n", + "\"21760;21760\" | \n", + "FLOAT;FLOAT | \n", + "\"21760\" | \n", + "4.390391 | \n", + "0.011915 | \n", + "4.820 | \n", + "3.98 | \n", + "1024 | \n", + "4495.760 | \n", + "
2 | \n", + "0 | \n", + "Add106 | \n", + "\"21760,4096;21760,4096\" | \n", + "FLOAT;FLOAT | \n", + "\"21760,4096\" | \n", + "933.504395 | \n", + "462.979321 | \n", + "1257.140 | \n", + "927.38 | \n", + "1024 | \n", + "955908.500 | \n", + "
3 | \n", + "0 | \n", + "Add111 | \n", + "\"4096,4096;4096,4096\" | \n", + "FLOAT;FLOAT | \n", + "\"4096,4096\" | \n", + "91.267363 | \n", + "2.158275 | \n", + "97.120 | \n", + "85.12 | \n", + "1024 | \n", + "93457.780 | \n", + "
4 | \n", + "0 | \n", + "Add118 | \n", + "\"12288,4096;12288,4096\" | \n", + "FLOAT;FLOAT | \n", + "\"12288,4096\" | \n", + "526.312012 | \n", + "1462.617511 | \n", + "787.780 | \n", + "424.24 | \n", + "1024 | \n", + "538943.500 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
2513 | \n", + "15 | \n", + "trans_Cast_12 | \n", + "\"4096,1,1,128\" | \n", + "FLOAT | \n", + "\"4096,1,1,128\" | \n", + "8.486495 | \n", + "0.060174 | \n", + "9.820 | \n", + "8.20 | \n", + "2048 | \n", + "17380.342 | \n", + "
2514 | \n", + "15 | \n", + "trans_Cast_13 | \n", + "\"4096,1,1,128\" | \n", + "FLOAT | \n", + "\"4096,1,1,128\" | \n", + "10.534564 | \n", + "0.166380 | \n", + "12.900 | \n", + "9.48 | \n", + "2048 | \n", + "21574.787 | \n", + "
2515 | \n", + "15 | \n", + "trans_Cast_14 | \n", + "\"4096,1,1,128\" | \n", + "FLOAT | \n", + "\"4096,1,1,128\" | \n", + "9.784551 | \n", + "0.295368 | \n", + "13.021 | \n", + "8.56 | \n", + "2048 | \n", + "20038.761 | \n", + "
2516 | \n", + "15 | \n", + "trans_Cast_15 | \n", + "\"4096,1,1,128\" | \n", + "DT_BF16 | \n", + "\"4096,1,1,128\" | \n", + "8.342211 | \n", + "0.120471 | \n", + "10.220 | \n", + "7.86 | \n", + "2048 | \n", + "17084.848 | \n", + "
2517 | \n", + "15 | \n", + "trans_Cast_16 | \n", + "\"4096,1,1,128\" | \n", + "DT_BF16 | \n", + "\"4096,1,1,128\" | \n", + "9.507589 | \n", + "0.117111 | \n", + "11.681 | \n", + "9.18 | \n", + "2048 | \n", + "19471.543 | \n", + "
2518 rows × 11 columns
\n", + "problem | \n", + "description | \n", + "suggestion | \n", + "problem count | \n", + "total_time(us) | \n", + "time ratio | \n", + "income(us) | \n", + "income ratio | \n", + "
---|---|---|---|---|---|---|---|
block dim | \n", + "some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi sor\\operator_tuning_file_20240613153259.cfg' | \n",
+ " 101 | \n", + "814.0199999999999 | \n", + "1.0 | \n", + "\n", + " | \n", + " |
op_name | \n", + "op_type | \n", + "task_type | \n", + "task_duration | \n", + "income | \n", + "block_dim | \n", + "mix_block_dim | \n", + "input_shapes | \n", + "input_data_types | \n", + "input_formats | \n", + "output_shapes | \n", + "output_data_types | \n", + "output_formats | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm- LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35 | \n",
+ " Square | \n", + "AI_VECTOR_CORE | \n", + "42.76 | \n", + "0 | \n", + "16 | \n", + "0 | \n", + ""128,128" | \n", + "FLOAT | \n", + "NCHW | \n", + ""128,1" | \n", + "FLOAT | \n", + "NCHW | \n", + "
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm- LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78 | \n",
+ " Square | \n", + "AI_VECTOR_CORE | \n", + "42.24 | \n", + "0 | \n", + "16 | \n", + "0 | \n", + ""128,128" | \n", + "FLOAT | \n", + "NCHW | \n", + ""128,1" | \n", + "FLOAT | \n", + "NCHW | \n", + "
Default/lm_head-Linear/MatMul-op213 | \n", + "MatMulV2 | \n", + "AI_CORE | \n", + "39.02 | \n", + "0 | \n", + "20 | \n", + "0 | \n", + ""128,128;128,32000" | \n", + "FLOAT16;FLOAT16 | \n", + "FORMAT_ND;FORMAT_ND | \n", + ""128,32000" | \n", + "FLOAT | \n", + "FORMAT_ND | \n", + "
problem | \n", + "description | \n", + "suggestion | \n", + "problem count | \n", + "total_time(us) | \n", + "time ratio | \n", + "income(us) | \n", + "income ratio | \n", + "
---|---|---|---|---|---|---|---|
block dim | \n", + "some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi sor\\operator_tuning_file_20240613153259.cfg' | \n",
+ " 101 | \n", + "814.0199999999999 | \n", + "1.0 | \n", + "\n", + " | \n", + " |
operator no bound | \n", + "There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task duration operators need to be tuned are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi sor\\operator_tuning_file_20240613153259.cfg' | \n",
+ " 95 | \n", + "814.0199999999999 | \n", + "0.7985 | \n", + "\n", + " | \n", + " |
op_name | \n", + "op_type | \n", + "task_type | \n", + "task_duration | \n", + "vec_ratio | \n", + "mac_ratio | \n", + "scalar_ratio | \n", + "mte1_ratio | \n", + "mte2_ratio | \n", + "mte3_ratio | \n", + "block_dim | \n", + "input_shapes | \n", + "input_data_types | \n", + "input_formats | \n", + "output_shapes | \n", + "output_data_types | \n", + "output_formats | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm- LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35 | \n",
+ " Square | \n", + "AI_VECTOR_CORE | \n", + "42.76 | \n", + "0.4654 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0.0056 | \n", + "16 | \n", + ""128,128" | \n", + "FLOAT | \n", + "NCHW | \n", + ""128,1" | \n", + "FLOAT | \n", + "NCHW | \n", + "
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm- LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78 | \n",
+ " Square | \n", + "AI_VECTOR_CORE | \n", + "42.24 | \n", + "0.466 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0.0062 | \n", + "16 | \n", + ""128,128" | \n", + "FLOAT | \n", + "NCHW | \n", + ""128,1" | \n", + "FLOAT | \n", + "NCHW | \n", + "
Default/lm_head-Linear/MatMul-op213 | \n", + "MatMulV2 | \n", + "AI_CORE | \n", + "39.02 | \n", + "0 | \n", + "0.1105 | \n", + "0.0119 | \n", + "0.0857 | \n", + "0.4284 | \n", + "0 | \n", + "20 | \n", + ""128,128;128,32000" | \n", + "FLOAT16;FLOAT16 | \n", + "FORMAT_ND;FORMAT_ND | \n", + ""128,32000" | \n", + "FLOAT | \n", + "FORMAT_ND | \n", + "
problem | \n", + "description | \n", + "suggestion | \n", + "problem count | \n", + "total_time(us) | \n", + "time ratio | \n", + "income(us) | \n", + "income ratio | \n", + "
---|---|---|---|---|---|---|---|
block dim | \n", + "some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi sor\\operator_tuning_file_20240613153259.cfg' | \n",
+ " 101 | \n", + "814.0199999999999 | \n", + "1.0 | \n", + "\n", + " | \n", + " |
operator no bound | \n", + "There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task duration operators need to be tuned are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | \n",
+ " 1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi sor\\operator_tuning_file_20240613153259.cfg' | \n",
+ " 95 | \n", + "814.0199999999999 | \n", + "0.7985 | \n", + "\n", + " | \n", + " |
AICPU operator | \n", + "Some operators and task duration exceed 20 us, such as : Cast | \n", + "1. Modify code to avoid aicpu operator | \n", + "39 | \n", + "686568.860000001 | \n", + "0.0189 | \n", + "\n", + " | \n", + " |
op_name | \n", + "op_type | \n", + "task_duration | \n", + "input_shapes | \n", + "input_data_types | \n", + "input_formats | \n", + "output_shapes | \n", + "output_data_types | \n", + "output_formats | \n", + "stack_info | \n", + "
---|---|---|---|---|---|---|---|---|---|
trans_Cast_5 | \n", + "Cast | \n", + "493.64 | \n", + """ | \n", + "INT32 | \n", + "FORMAT_ND | \n", + """ | \n", + "UINT64 | \n", + "FORMAT_ND | \n", + "/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279): dropout; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/dropout.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/language_model.py(236): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/language_model.py(425): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/module.py(184): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/distributed.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; ../../pretrain_gpt.py(88): forward_step; /profiling_auto_GPT3/megatron/schedules.py(118): forward_step; /home/s30040711/Megatron- LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(96): forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419): train_step; /profiling_auto_GPT3/megatron/training.py(837): train; /profiling_auto_GPT3/megatron/training.py(152): pretrain; ../../pretrain_gpt.py(122): <module> | \n",
+ "
trans_Cast_5 | \n", + "Cast | \n", + "413.4 | \n", + """ | \n", + "INT32 | \n", + "FORMAT_ND | \n", + """ | \n", + "UINT64 | \n", + "FORMAT_ND | \n", + "/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279): dropout; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/dropout.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/language_model.py(236): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/language_model.py(425): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/module.py(184): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/distributed.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; ../../pretrain_gpt.py(88): forward_step; /profiling_auto_GPT3/megatron/schedules.py(118): forward_step; /home/s30040711/Megatron- LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(109): forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419): train_step; /profiling_auto_GPT3/megatron/training.py(837): train; /profiling_auto_GPT3/megatron/training.py(152): pretrain; ../../pretrain_gpt.py(122): <module> | \n",
+ "
{{ header }} | + {% endfor %} +|
{{ element|round(2) }} | + {% else %} +{{ element }} | + {% endif %} + {% endfor %} +
{{ header }} | + {% endfor %} +|
{{ element|round(2) }} | + {% else %} +{{ element }} | + {% endif %} + {% endfor %} +
Structure | +Counts | +Elapsed Time(us) | +
---|---|---|
{{ node.fusion_pattern|safe }} | +{{ node.counts|safe }} | +{{ node.total_duration|safe }} | +
OP Name | +OP Type | +Elapsed Time(us) | +
---|---|---|
{{ node.op_name|safe }} | +{{ node.dtype|safe }} | +{{ node.duration|safe }} | +
Description | +Suggestion | +Elapsed Time(us) | +Time Ratio | +
---|---|---|---|
{{ format_result.record.optimization_item.description|safe }} | +{{ format_result.suggestion|safe }} | +{{ format_result.task_duration|safe }} | +{{ format_result.record.statistics_item.task_duration_ratio|safe }} | +
Operator Type | +Counts | +Elapsed Time(us) | +
---|---|---|
{{ op_info.summary.op_type|safe }} | +{{ op_info.summary.counts|safe }} | +{{ op_info.summary.total_duration|safe }} | +
+ Suggestion {{ loop.index|safe }}: {{suggestion|safe}} +
+ {% endfor %} +Suggestion 1: Modify code to avoid AICPU operator
+ {% endif %} + + {{ info.op_info_list[0].stack_info|safe }} +Description | +Suggestion | +Elapsed Time(us) | +Time Ratio | +
---|---|---|---|
{{ format_result.record.optimization_item.description|safe }} | +{{ format_result.suggestion|safe }} | +{{ format_result.task_duration|safe }} | +{{ format_result.record.statistics_item.task_duration_ratio|safe }} | +
Operator Type | +Counts | +Elapsed Time(us) | +
---|---|---|
{{ op_info.summary.op_type|safe }} | +{{ op_info.summary.counts|safe }} | +{{ op_info.summary.total_duration|safe }} | +
Description | +Suggestion | +
---|---|
{{ optimizer.description |safe }} | +{{ optimizer.suggestion|safe }} | +
Issue | +Counts | +Elapsed Time(us) | +
---|---|---|
{{ issue.op_name |safe }} | +{{ issue.counts |safe }} | +{{ issue.total_time |safe }} | +
Description | +Suggestion | +
---|---|
{{ format_result.record.optimization_item.description|safe }} | +{{ format_result.suggestion|safe }} | +
Description | +Suggestion | +Elapsed Time(us) | +Time Ratio | +
---|---|---|---|
{{ format_result.record.optimization_item.description|safe }} | +{{ format_result.suggestion|safe }} | +{{ format_result.task_duration|safe }} | +{{ format_result.record.statistics_item.task_duration_ratio|safe }} | +
Operator Type | +Counts | +Elapsed Time(us) | +
---|---|---|
{{ op_info.summary.op_type|safe }} | +{{ op_info.summary.counts|safe }} | +{{ op_info.summary.total_duration|safe }} | +
{{ header }} | + {% endfor %} +
{{ element }} | + {% endfor %} +
problem | \n", + "description | \n", + "suggestion | \n", + "
---|---|---|
timeline_fusion_ops | \n", + "Found 2 apis to be replaced based on the runtime env cann-8.0.0 and torch-2.1.0 | \n", + "1. Please replace training api according to sub table 'Affinity training api' | \n", + "
Affinity API | \n", + "Code stacks | \n", + "Stack called counts | \n", + "
---|---|---|
optimizer.clip_grad_norm_fused_ | \n", + "/home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- packages/torch/nn/utils/clip_grad.py(49): clip_grad_norm_; /home/ma- user/work/algorithms/doc_cls/Bert.py(205): train_epoch; /home/ma- user/work/algorithms/doc_cls/Bert.py(252): <module> | \n",
+ " 2 | \n", + "
torch_npu.optim.NpuFusedAdamW | \n", + "/home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- packages/torch_npu/npu/profiler.py(675): __enter__; /home/ma- user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- packages/torch_npu/npu/profiler.py(719): wrapper; /home/ma- user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- packages/torch/optim/lr_scheduler.py(65): wrapper; /home/ma- user/work/algorithms/doc_cls/Bert.py(219): train_epoch; /home/ma- user/work/algorithms/doc_cls/Bert.py(252): <module> | \n",
+ " 2 | \n", + "
Framework Time Stats
\"))\n", + " display(fwk_df)\n", + " cluster_display.display_duration_boxplots(figs, fwk_df, title=\"Framework Time\", x_title=\"Name\", y_title=\"Time\")\n", + " display(HTML(\"Cann Time Stats
\"))\n", + " display(cann_df)\n", + " cluster_display.display_duration_boxplots(figs, cann_df, title=\"Cann Time\", x_title=\"Name\", y_title=\"Time\")\n", + " display(HTML(\"Device Time Stats
\"))\n", + " display(device_df)\n", + " cluster_display.display_duration_boxplots(figs, device_df, title=\"Device Time\", x_title=\"Name\", y_title=\"Time\")\n", + "\n", + "steps = list(all_fwk_stats_gdf.groups.keys())\n", + "if steps:\n", + " cluster_display.display_stats_optional_combobox(steps, display_stats_mstx_step_combobox, \n", + " [all_fwk_stats_gdf, all_cann_stats_gdf, all_device_stats_gdf], \"Step:\")\n", + "else:\n", + " print(\"There is no step in stats, so no need to display\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 集群Rank MSTX数据分析\n", + "\n", + "将集群内每个Rank的MSTX数据进行汇总,按打点Name分类,统计分析耗时情况,时间单位为微秒(us)\n", + "\n", + "包含以下统计项:\n", + "- Name:打点名称\n", + "- FrameworkDuration(Us):框架侧耗时\n", + "- CannDuration(Us):Cann侧耗时\n", + "- DeviceDuration(Us):Device侧耗时\n", + "- Rank:Rank序号\n", + "- StepId:Step序号" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def display_mstx_duration_by_rank(selected, args):\n", + " mark_stats_gdf = args\n", + " df = mark_stats_gdf.get_group(selected).sort_values(\"Rank\")\n", + " display(df)\n", + " fwk_duration = []\n", + " cann_duration = []\n", + " device_duration = []\n", + " step_ids = []\n", + " for step_id, step_df in df.groupby(\"StepId\"):\n", + " fwk_duration.append((step_id, step_df[\"FrameworkDuration(Us)\"].values))\n", + " cann_duration.append((step_id, step_df[\"CannDuration(Us)\"].values))\n", + " device_duration.append((step_id, step_df[\"DeviceDuration(Us)\"].values))\n", + " step_ids.append(step_id)\n", + " fwk_df = pd.concat([pd.Series(dur, name=step_id) for step_id, dur in fwk_duration], axis=1)\n", + " cann_df = pd.concat([pd.Series(dur, name=step_id) for step_id, dur in cann_duration], axis=1)\n", + " device_df = pd.concat([pd.Series(dur, name=step_id) for step_id, dur in device_duration], axis=1)\n", + " figs = []\n", + " ranks = df[\"Rank\"].drop_duplicates()\n", + " cluster_display.display_graph(figs, 
ranks, fwk_df[step_ids],\n", + " title=\"Framework Time\", x_title=\"Rank\", y_title=\"Time\", legend_title=\"Step\")\n", + " cluster_display.display_graph(figs, ranks, cann_df[step_ids],\n", + " title=\"Cann Time\", x_title=\"Rank\", y_title=\"Time\", legend_title=\"Step\")\n", + " cluster_display.display_graph(figs, ranks, device_df[step_ids],\n", + " title=\"Device Time\", x_title=\"Rank\", y_title=\"Time\", legend_title=\"Step\")\n", + "\n", + "mark_stats_gdf = mark_stats_df.groupby(mark_stats_df.index)\n", + "names = list(mark_stats_gdf.groups.keys())\n", + "if steps:\n", + " cluster_display.display_stats_optional_combobox(names, display_mstx_duration_by_rank, mark_stats_gdf, \"Name:\")\n", + "else:\n", + " print(\"There is no mark name in stats, so no need to display\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/profiler/cluster_analyse_review/analysis/step_trace_time_analysis.py b/profiler/cluster_analyse_review/analysis/step_trace_time_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..6a886fffa97b142e8267066117f561154d85b162 --- /dev/null +++ b/profiler/cluster_analyse_review/analysis/step_trace_time_analysis.py @@ -0,0 +1,126 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from common_func.db_manager import DBManager +from common_func.constant import Constant +from common_func.file_manager import FileManager +from prof_bean.step_trace_time_bean import StepTraceTimeBean + + +class StepTraceTimeAnalysis: + CLUSTER_TRACE_TIME_CSV = "cluster_step_trace_time.csv" + CLUSTER_TRACE_TIME_TABLE = "ClusterStepTraceTime" + + def __init__(self, param: dict): + self.collection_path = param.get(Constant.COLLECTION_PATH) + self.data_map = param.get(Constant.DATA_MAP) + self.communication_group = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COMMUNICATION_GROUP) + self.step_time_dict = {} + self.step_data_list = [] + self.data_type = param.get(Constant.DATA_TYPE) + + @staticmethod + def get_max_data_row(data_group_list: list): + if not data_group_list: + return [] + ret = [] + for idx in range(len(data_group_list[0])): + max_val = 0 + for idy in range(len(data_group_list)): + max_val = max(max_val, data_group_list[idy][idx]) + ret.append(max_val) + return ret + + def run(self): + self.load_step_trace_time_data() + self.analyze_step_time() + self.dump_data() + + def dump_data(self): + if not self.step_data_list: + print("[WARNING] Can't get step time info!") + return + if self.data_type == Constant.TEXT: + headers = self.get_headers() + FileManager.create_csv_file(self.collection_path, self.step_data_list, self.CLUSTER_TRACE_TIME_CSV, headers) + else: + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + DBManager.create_tables(result_db, self.CLUSTER_TRACE_TIME_TABLE) + column_len = DBManager.get_table_column_count(result_db, self.CLUSTER_TRACE_TIME_TABLE) + data_len = len(self.step_data_list[0]) + if data_len < column_len: + for data in self.step_data_list: + data.extend([0] * (column_len - data_len)) + conn, 
cursor = DBManager.create_connect_db(result_db) + sql = "insert into {} values ({value})".format(self.CLUSTER_TRACE_TIME_TABLE, + value="?," * (len(self.step_data_list[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, self.step_data_list) + DBManager.destroy_db_connect(conn, cursor) + + def load_step_trace_time_data(self): + for rank_id, profiling_dir_path in self.data_map.items(): + if self.data_type == Constant.TEXT: + step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.STEP_TIME_CSV) + if os.path.exists(step_time_file): + self.step_time_dict[rank_id] = FileManager.read_csv_file(step_time_file, StepTraceTimeBean) + else: + step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, + Constant.DB_COMMUNICATION_ANALYZER) + if (os.path.exists(step_time_file) and + DBManager.check_tables_in_db(step_time_file, Constant.TABLE_STEP_TRACE)): + conn, cursor = DBManager.create_connect_db(step_time_file) + sql = "select * from {0}".format(Constant.TABLE_STEP_TRACE) + data = DBManager.fetch_all_data(cursor, sql, is_dict=False) + self.step_time_dict[rank_id] = data + DBManager.destroy_db_connect(conn, cursor) + if not self.step_time_dict.get(rank_id): + print(f"[WARNING] Rank {rank_id} does not have a valid step_trace_time data in {self.data_type} file.") + + def analyze_step_time(self): + for rank_id, data_bean_list in self.step_time_dict.items(): + for data_bean in data_bean_list: + if self.data_type == Constant.TEXT: + self.step_data_list.append([data_bean.step, Constant.RANK, rank_id] + data_bean.row) + else: + self.step_data_list.append([data_bean[0], Constant.RANK, rank_id] + list(data_bean[1:])) + stage_list = self.communication_group.get(Constant.P2P) + if not stage_list: + return + step_group_dict = {} + for data_list in self.step_data_list: + stage_group = tuple() + for stage in stage_list: + if data_list[2] in stage: + stage_group = tuple(stage) + break + key = (data_list[0], stage_group) + 
step_group_dict.setdefault(key, []).append(data_list[3:]) + + for key, data_group_list in step_group_dict.items(): + if self.data_type == Constant.TEXT: + self.step_data_list.append([key[0], Constant.STAGE, key[1]] + self.get_max_data_row(data_group_list)) + else: + index = "(" + ",".join(str(i) for i in key[1]) + ")" + self.step_data_list.append([key[0], Constant.STAGE, index] + self.get_max_data_row(data_group_list)) + + def get_headers(self): + if self.step_time_dict: + for rank in self.step_time_dict: + if self.step_time_dict.get(rank): + return self.step_time_dict[rank][0].all_headers + return [] diff --git a/profiler/cluster_analyse_review/cluster_analysis.py b/profiler/cluster_analyse_review/cluster_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..a8d01dcfe348be6b47c0a71099cedab64b6b3e06 --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_analysis.py @@ -0,0 +1,148 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os + +from cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor +from cluster_data_preprocess.mindspore_data_preprocessor import MindsporeDataPreprocessor +from communication_group.communication_group_generator import CommunicationGroupGenerator +from common_func.constant import Constant +from common_func.file_manager import FileManager +from common_func.path_manager import PathManager +from common_func import analysis_loader +from analysis.analysis_facade import AnalysisFacade + +COMM_FEATURE_LIST = ['all', 'communication_time', 'communication_matrix'] +ALL_FEATURE_LIST = ['all', 'communication_time', 'communication_matrix', 'cann_api_sum', 'hccl_sum', 'compute_op_sum', + 'mstx_sum'] + + +def get_analysis_args(analysis_class, analysis_args): + parser = argparse.ArgumentParser(description="custom analysis args") + parser.add_argument("--parallel_mode", type=str, help="context mode", default="concurrent") + parser.add_argument("--export_type", type=str, help="export type", default="db") + analysis_class[1].add_parser_argument(parser) + return parser.parse_args(analysis_args) + +def parse_specific_params(analysis_name, analysis_args): + analysis_class = analysis_loader.get_class_from_name(analysis_name) + if not analysis_class: + print("[ERROR] undefined analysis.") + return None + + args_parsed = get_analysis_args(analysis_class, analysis_args) + specific_params = { + Constant.RECIPE_NAME: analysis_class[0], + Constant.RECIPE_CLASS: analysis_class[1], + Constant.PARALLEL_MODE: args_parsed.parallel_mode, + Constant.EXPORT_TYPE: args_parsed.export_type + } + specific_params.update(analysis_class[1].parse_argument(args_parsed)) + return specific_params + +class Interface: + ASCEND_PT = "ascend_pt" + ASCEND_MS = "ascend_ms" + + + def __init__(self, params: dict): + self.collection_path = PathManager.get_realpath(params.get(Constant.COLLECTION_PATH)) + self.analysis_mode = params.get(Constant.ANALYSIS_MODE) + 
self.data_map = {} + self.communication_group = {} + self.collective_group_dict = {} + self.communication_ops = [] + self.matrix_ops = [] + self.origin_params = params + + def allocate_prof_data(self): + ascend_pt_dirs = [] + ascend_ms_dirs = [] + for root, dirs, files in os.walk(self.collection_path): + for dir_name in dirs: + if dir_name.endswith(self.ASCEND_PT): + ascend_pt_dirs.append(os.path.join(root, dir_name)) + if dir_name.endswith(self.ASCEND_MS): + ascend_ms_dirs.append(os.path.join(root, dir_name)) + pytorch_processor = PytorchDataPreprocessor(ascend_pt_dirs) + pt_data_map = pytorch_processor.get_data_map() + data_type = pytorch_processor.get_data_type() + ms_data_map = MindsporeDataPreprocessor(ascend_ms_dirs).get_data_map() + if pt_data_map and ms_data_map: + print("[ERROR] Can not analyze pytorch and mindspore meantime.") + return [] + return (pt_data_map, data_type) if pt_data_map else (ms_data_map, Constant.TEXT) + + def run(self): + PathManager.check_input_directory_path(self.collection_path) + PathManager.check_path_owner_consistent(self.collection_path) + data_map, data_type = self.allocate_prof_data() + if not data_map: + print("[WARNING] Can not get rank info or profiling data.") + return + if data_type == Constant.INVALID: + print("[ERROR] The current folder contains both DB and other files. Please check.") + return + if self.analysis_mode not in COMM_FEATURE_LIST: + if data_type != Constant.DB: + print("[ERROR] The current analysis node only supports DB as input data. 
Please check.") + return + FileManager.create_output_dir(self.collection_path, is_overwrite=True) + params = { + Constant.COLLECTION_PATH: self.collection_path, + Constant.DATA_MAP: data_map, + Constant.DATA_TYPE: data_type, + Constant.RECIPE_NAME: self.origin_params.get(Constant.RECIPE_NAME, ""), + Constant.RECIPE_CLASS: self.origin_params.get(Constant.RECIPE_CLASS), + Constant.PARALLEL_MODE: self.origin_params.get(Constant.PARALLEL_MODE, ""), + Constant.EXPORT_TYPE: self.origin_params.get(Constant.EXPORT_TYPE, "") + } + params.update(params[Constant.RECIPE_CLASS].get_extra_argument(self.origin_params)) + AnalysisFacade(params).recipe_analyze() + else: + FileManager.create_output_dir(self.collection_path) + params = { + Constant.COLLECTION_PATH: self.collection_path, + Constant.DATA_MAP: data_map, + Constant.ANALYSIS_MODE: self.analysis_mode, + Constant.DATA_TYPE: data_type + } + comm_data_dict = CommunicationGroupGenerator(params).generate() + params[Constant.COMM_DATA_DICT] = comm_data_dict + AnalysisFacade(params).cluster_analyze() + + +def cluster_analysis_main(args=None): + parser = argparse.ArgumentParser(description="cluster analysis module") + parser.add_argument('-d', '--collection_path', type=str, required=True, help="profiling data path") + parser.add_argument('-m', '--mode', choices=ALL_FEATURE_LIST, + default='all', help="different analysis mode") + args_parsed, args_remained = parser.parse_known_args(args=args) + parameter = { + Constant.COLLECTION_PATH: args_parsed.collection_path, + Constant.ANALYSIS_MODE: args_parsed.mode + } + if args_parsed.mode in COMM_FEATURE_LIST: + if args_remained: + print(f"[ERROR] The specific argument {args_remained} is not supported for communication analysis.") + return + else: + parameter.update(parse_specific_params(args_parsed.mode, args_remained)) + Interface(parameter).run() + + +if __name__ == "__main__": + cluster_analysis_main() diff --git a/profiler/cluster_analyse_review/cluster_data_preprocess/__init__.py 
b/profiler/cluster_analyse_review/cluster_data_preprocess/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_data_preprocess/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/cluster_data_preprocess/data_preprocessor.py b/profiler/cluster_analyse_review/cluster_data_preprocess/data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..72d65ae6571e68564e46f43463843d1f46a3a69e --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_data_preprocess/data_preprocessor.py @@ -0,0 +1,41 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +from abc import abstractmethod + + +class DataPreprocessor: + PROFILER_INFO_HEAD = 'profiler_info_' + PROFILER_INFO_EXTENSION = '.json' + + def __init__(self, path_list: list): + self.path_list = path_list + self.data_map = {} + + @abstractmethod + def get_data_map(self): + pass + + def get_rank_id(self, dir_name: str) -> int: + files = os.listdir(dir_name) + for file_name in files: + if file_name.startswith(self.PROFILER_INFO_HEAD) and file_name.endswith(self.PROFILER_INFO_EXTENSION): + rank_id_str = file_name[len(self.PROFILER_INFO_HEAD): -1 * len(self.PROFILER_INFO_EXTENSION)] + try: + rank_id = int(rank_id_str) + except ValueError: + rank_id = -1 + return rank_id + return -1 diff --git a/profiler/cluster_analyse_review/cluster_data_preprocess/mindspore_data_preprocessor.py b/profiler/cluster_analyse_review/cluster_data_preprocess/mindspore_data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..a3e09983ddb54b972a9e343c1661b5c8b2cbb8c8 --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_data_preprocess/mindspore_data_preprocessor.py @@ -0,0 +1,41 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from collections import defaultdict

from cluster_data_preprocess.data_preprocessor import DataPreprocessor


class MindsporeDataPreprocessor(DataPreprocessor):
    """Maps MindSpore profiling directories to rank ids, keeping the first
    directory per rank after sorting on a token in the directory name."""

    def __init__(self, path_list: list):
        super().__init__(path_list)

    def get_data_map(self) -> dict:
        """Return {rank_id: directory}; directories with an unreadable or
        invalid rank id are reported and skipped.

        Raises:
            RuntimeError: when a directory name lacks the '_'-separated
                token used for ordering.
        """
        grouped = defaultdict(list)
        for directory in self.path_list:
            rank = self.get_rank_id(directory)
            if rank < 0:
                # Unidentifiable directory: warn and move on rather than abort.
                print('[Error]fail to get rankid or rankid invalid.')
            else:
                grouped[rank].append(directory)

        # Sort on the third-from-last '_' token so candidates[0] is the
        # first capture when a rank appears more than once.
        # NOTE(review): presumably this token is a timestamp -- confirm
        # against the directory naming convention.
        def sort_token(name):
            return name.split('_')[-3]

        try:
            for rank, candidates in grouped.items():
                candidates.sort(key=sort_token)
                self.data_map[rank] = candidates[0]
        except Exception as e:
            raise RuntimeError("Found invalid directory name!") from e
        return self.data_map
from collections import defaultdict
import os

from cluster_data_preprocess.data_preprocessor import DataPreprocessor
from common_func.constant import Constant
from common_func.file_manager import FileManager


class PytorchDataPreprocessor(DataPreprocessor):
    """Maps Ascend PyTorch profiling directories to rank ids and records the
    export type declared in each profiler_info_*.json file."""

    def __init__(self, path_list: list):
        super().__init__(path_list)
        # Export types seen across all scanned directories; a consistent
        # collection yields exactly one distinct value.
        self.data_type = set()

    def get_data_map(self) -> dict:
        """Return {rank_id: directory}, keeping the first directory per rank
        after sorting on a token in the directory name.

        Side effect: fills self.data_type with every export type found in
        the profiler_info files.

        Raises:
            RuntimeError: when a directory name lacks the '_'-separated
                token used for ordering.
        """
        rank_id_map = defaultdict(list)
        for dir_name in self.path_list:
            rank_id = self.get_rank_id(dir_name)
            if rank_id < 0:
                print('[Error]fail to get rankid or rankid invalid.')
                continue
            for file_name in os.listdir(dir_name):
                if file_name.startswith(self.PROFILER_INFO_HEAD) and file_name.endswith(self.PROFILER_INFO_EXTENSION):
                    file_path = os.path.join(dir_name, file_name)
                    config = FileManager.read_json_file(file_path)
                    # Missing keys fall back to the default TEXT export type.
                    self.data_type.add(config.get(Constant.CONFIG, {}).get(Constant.EXPER_CONFIG, {}).
                                       get(Constant.EXPORT_TYPE, Constant.TEXT))
            rank_id_map[rank_id].append(dir_name)

        try:
            for (rank_id, dir_list) in rank_id_map.items():
                # NOTE(review): presumably the '_'-separated token is a
                # timestamp -- confirm against the directory naming convention.
                dir_list.sort(key=lambda x: x.split('_')[-3])
                self.data_map[rank_id] = dir_list[0]
        except Exception as e:
            raise RuntimeError("Found invalid directory name!") from e
        return self.data_map

    def get_data_type(self):
        """Return the single export type seen, or Constant.INVALID when zero
        or several distinct types were collected.

        Fix: the original used set.pop(), which emptied the set so a second
        call wrongly returned INVALID; peek without mutating instead.
        """
        if len(self.data_type) == 1:
            return next(iter(self.data_type))
        return Constant.INVALID
+将算子以op_name、input_shape、input_size、output_shape进行分类,统计每一类算子,在不同节点(node)的不同卡(device)上,执行时间的最大、最小、方差、平均时间以及范围。 +### xxx_info.html + +主要是各个特性(time和ratio)的html文件,以html方式展示top_n算子的箱线图。 + +time和ratio表示AI Core和AI Vector Core算子性能指标中的耗时和占比字段。 + +以html文件展示TopN算子执行耗时和占比的箱线图。 + +有TopN个算子就会有TopN个坐标系,每个坐标系表示一个算子的特性,以total_time的平均值从左向右依次向下排序。 + +- 横坐标:node_device表示第几个node的第几张卡,从小到大排序。 +- 纵坐标:时间。 +- 坐标名:在坐标下方,以op_name-input_shape拼接展示。 + +# 操作指导 + +1. 准备性能数据 + + 拷贝所有node上的性能数据到一个环境里,性能数据必须包含在node*目录下,例如当前集群场景为2机16卡,那么就是两个node分别有八个device,拷贝性能数据目录如下: + + ```bash + ├── node0 # 可以是node0或node0_xxx,表示某个节点 + │ ├── PROF_XXXXX # 单个device的性能数据,须完成msprof性能数据解析 + │ ├── SUMMARY + │ ├── op_summary_XX.csv + | ...... # 一共八张卡的性能数据 + ├── node1 # 可以是node1 或者node1_xxx表示某个节点 + │ ├── PROF_XXXXX # 单个device的profiling数据 + │ ├── SUMMARY + │ ├── op_summary_XX.csv # 用来做解析的op_summary表格 + | ...... + ``` + +2. 拷贝脚本准备环境 + + 将cluster_prof_Info_analysis.py脚本拷贝到一个文件夹里,并安装对应的Python库。 + + ```bash + pip install pandas + pip install plotly + ``` + +3. 
运行脚本 + + ```bash + python3 cluster_prof_Info_analysis.py -d data_path -t type -n top_n + ``` + + - -d:集群场景性能数据目录,输入node的上一级目录。 + - -t:获取分析信息结果文件类型,可取值:html、csv、all,默认html。 + - -n:html分析独有,表示需要展示的是平均时间top_n的算子,默认10,配置超过30时需要一定时间。 + +异常情况处理: + +- -n参数必须大于0,如果输入<=0, 默认只导出一个算子的数据。 +- 配置-n参数值大于算子总数时,按等于算子数处理。 +- 部分没有op_summary的,不显示也不报错。 +- 目录下不存在op_summary时,执行报错无法找到数据文件。 +- op_summary列数据错误或读不到数据时,提示具体出错文件。 +- -t参数配置错误时,提示输入错误,并提示正确的配置。 diff --git a/profiler/cluster_analyse_review/cluster_kernels_analysis/__init__.py b/profiler/cluster_analyse_review/cluster_kernels_analysis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/cluster_analyse_review/cluster_kernels_analysis/cluster_prof_Info_analysis.py b/profiler/cluster_analyse_review/cluster_kernels_analysis/cluster_prof_Info_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..27e3c229c56d7c2a1afe6ae49d98c96b19bc55ff --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_kernels_analysis/cluster_prof_Info_analysis.py @@ -0,0 +1,327 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import sys
import argparse
import re
import os
import stat
import shutil
import warnings
from pathlib import Path

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import plot

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from common_func.path_manager import PathManager


MAX_READ_FILE_BYTES = 64 * 1024 * 1024


class FormDataProcessor:
    """Locates op_summary CSV files under a cluster data directory and merges
    them into one DataFrame tagged with device and node ids."""

    def __init__(self, path, form_name):
        self.form_name = form_name
        self.files = self.get_files_with_prefix_recursive(path, form_name)

    def get_files_with_prefix_recursive(self, csv_path, match_str):
        """Recursively glob csv_path for match_str; raise when nothing matches."""
        hits = [str(item) for item in Path(csv_path).rglob(match_str)]
        if not hits:
            msg = f"Didn't find any file in folder {csv_path} that matches {match_str}"
            raise RuntimeError(msg)
        return hits

    def readSummaryData(self, columns_to_keep):
        """Merge every matched CSV into a single frame restricted to
        columns_to_keep, adding 'device_id' and 'node_id' columns derived
        from each file's path. Files that lack the columns or whose paths do
        not encode device/node ids are reported and skipped."""
        frames = [pd.DataFrame()]
        for csv_file in self.files:
            # msprof's own exports are not per-device op summaries; skip them.
            if "mindstudio_profiler_output" in csv_file:
                continue
            # Guard against unreadable/oversized files before parsing.
            PathManager.check_path_readable(csv_file)
            frame = pd.read_csv(csv_file)
            try:
                frame = frame[columns_to_keep]
            except KeyError:
                print(f"{csv_file}文件没有所需的列,请确认profiling数据的正确性:\n,以下列可能不存在{columns_to_keep}\n")
                continue
            try:
                frame['device_id'] = self.getDeviceId(csv_file)
            except Exception:
                print(f"文件 \"{csv_file}\" 的路径或者是文件夹名没有按照要求,请确保存在[device_]这一级文件夹,具体操作指导见readme\n")
                continue
            try:
                frame['node_id'] = self.getNodeId(csv_file)
            except Exception:
                print(f"文件 \"{csv_file}\" 的路径或者是文件夹名没有按照要求,请确保存在[node*]这一级文件夹,具体操作指导见readme\n")
                continue
            frames.append(frame)
        return pd.concat(frames)

    def getChipType(self):
        """Classify the chip generation from the first CSV's header: the
        presence of an 'aiv_time(us)' column marks the newer variant."""
        sample = pd.read_csv(self.files[0])
        return "ASCEND_NEW" if 'aiv_time(us)' in sample.columns else "ASCEND_OTHER"

    def getDeviceId(self, dir_path):
        """Extract the device id from a 'device_<N>' path component.

        NOTE(review): returns a str while getNodeId returns an int -- confirm
        the asymmetry is intended by the downstream grouping code.
        """
        return re.search(r'device_(\d+)', dir_path).group(1)

    def getNodeId(self, dir_path):
        """Extract the node id (as an int) from a 'node<N>' path component."""
        return int(re.search(r'node(\d+)', dir_path).group(1))

    def getRankNum(self):
        """Number of matched files (one per device summary found)."""
        return len(self.files)


# Table-driven lookup of the columns each analyzer needs for each chip type.
class ViewInfoManager:
    def __init__(self, chip_type):
        self.chip_type = chip_type
        self.op_summary_columns_dict = {}
        self.setOpSummaryColumnsParams()

    def setOpSummaryColumnsParams(self):
        """Build the per-chip, per-analyzer column configuration table.

        Entries may carry 'extend_attr_to_group': derived attributes (not CSV
        columns) that also participate in grouping.
        """
        group_cols = ["Op Name", "Input Shapes", "Input Data Types", "Output Shapes"]
        stats = ['mean', 'var', 'max', 'min']
        self.op_summary_columns_dict = {
            'ASCEND_NEW': {
                'TimeToCsvAnalyzer': {
                    'columns_to_group': list(group_cols),
                    'extend_attr_to_group': ["device_id", "node_id"],
                    'columns_to_view': ["Task Duration(us)"],
                    'calculate_fun': list(stats),
                },
                'StatisticalInfoToHtmlAnalyzer': {
                    'columns_to_group': list(group_cols),
                    "columns_to_view": ["Task Duration(us)", "aiv_time(us)", "aiv_vec_ratio",
                                        "aiv_scalar_ratio", "aiv_mte2_ratio", "aiv_mte3_ratio",
                                        "aicore_time(us)", "aic_mac_ratio", "aic_scalar_ratio",
                                        "aic_mte1_ratio", "aic_mte2_ratio", "aic_fixpipe_ratio"
                                        ],
                    'calculate_fun': list(stats),
                },
            },
            'ASCEND_OTHER': {
                'TimeToCsvAnalyzer': {
                    'columns_to_group': list(group_cols),
                    'extend_attr_to_group': ["device_id", "node_id"],
                    "columns_to_view": ["Task Duration(us)"],
                    'calculate_fun': list(stats),
                },
                'StatisticalInfoToHtmlAnalyzer': {
                    'columns_to_group': list(group_cols),
                    "columns_to_view": ["aicore_time(us)", "Task Duration(us)", "mac_ratio", "vec_ratio",
                                        "scalar_ratio", "mte1_ratio", "mte2_ratio", "mte3_ratio"],
                    'calculate_fun': list(stats),
                },
            },
        }

    def getColumnsInfo(self, analyzer_type):
        """Return the column config for (chip_type, analyzer_type), or None
        when either key is unknown."""
        return self.op_summary_columns_dict.get(self.chip_type, {}).get(analyzer_type)
__init__(self, chip_type, analyzer_type, dir_path): + self.chip_type = chip_type + view_info = ViewInfoManager(chip_type).getColumnsInfo(analyzer_type) + self.columns_to_view = view_info['columns_to_view'] + self.calculate_fun = view_info['calculate_fun'] + self.columns_to_group = view_info['columns_to_group'] + self.attrs_to_group = self.columns_to_group.copy() + if 'extend_attr_to_group' in view_info: + extend_attr_to_group = view_info['extend_attr_to_group'] + self.attrs_to_group.extend(extend_attr_to_group) + # 创建结果文件 + self.result_dir = os.path.join(dir_path, "result") + PathManager.check_path_length(self.result_dir) + if os.path.exists(self.result_dir): + shutil.rmtree(self.result_dir, onerror=self.on_rm_error) + PathManager.check_path_writeable(dir_path) + PathManager.make_dir_safety(self.result_dir) + + def getColumnsToGroup(self): + return self.columns_to_group + + def getColumnsToView(self): + return self.columns_to_view + + def calculateViewData(self, summary_data): + # 存储所有合并后的数据 + calculate_dict = {self.columns_to_view[i]: self.calculate_fun for i in range(len(self.columns_to_view))} + view_data = summary_data.groupby(self.attrs_to_group).agg(calculate_dict).reset_index() + return view_data + + def on_rm_error(self, func, path, exc_info): + # path contains the path of the file that couldn't be removed + # let's just assume that it's read-only and unlink it. 
+ os.chmod(path, stat.S_IWRITE) + os.unlink(path) + + +class TimeToCsvAnalyzer(OpSummaryAnalyzerBase): + def __init__(self, chip_type, dir_path): + super().__init__(chip_type, "TimeToCsvAnalyzer", dir_path) + + def GenerateDeliverable(self, summary_data, rank_num): + view_data = self.calculateViewData(summary_data) + # 规范化列名 + view_data.columns = [''.join(col) if col[1] == "" else '_'.join(col) for col in view_data.columns] + try: + for column in self.columns_to_view: + view_data[column + '_range'] = view_data[column + '_max'] - view_data[column + '_min'] + except Exception as e: + raise RuntimeError("Invalid view data!") from e + save_path = os.path.join(self.result_dir, "cluster_duration_time_analysis.csv") + PathManager.check_path_length(save_path) + view_data.to_csv(save_path, index=False) + # 该文件权限设置为只读权限,不允许修改 + os.chmod(save_path, stat.S_IROTH) + return view_data + + +class StatisticalInfoToHtmlAnalyzer(OpSummaryAnalyzerBase): + def __init__(self, chip_type, top_n, dir_path): + super().__init__(chip_type, "StatisticalInfoToHtmlAnalyzer", dir_path) + self.top_n = top_n + # top_n 如果不符合要求,报警告 + + def GenerateDeliverable(self, summary_data, rank_num): + view_data = self.calculateViewData(summary_data) + # 规范化列名 op_name/ --> op_name time/var 这种不变 + view_data.columns = [''.join(col) if col[1] == "" else col for col in view_data.columns] + + # 对使用到的变量进行初始设置 + self.top_n = min(max(self.top_n, 1), len(view_data)) + top_n_data = view_data.sort_values(("Task Duration(us)", 'var'), ascending=False).head(self.top_n) + + for column in self.columns_to_view: + # 分别给每一种特性画图 + self.drawPloty(column, summary_data, top_n_data, rank_num) + + def drawPloty(self, column, summary_data, top_n_data, rank_num): + col_num = self.getCalNum(rank_num) + row_num = self.top_n // col_num if self.top_n % col_num == 0 else (self.top_n + 1) // col_num + fig = make_subplots(rows=row_num, cols=col_num, vertical_spacing=0.03) + for i, (_, operation) in enumerate(top_n_data.iterrows()): + op_data = 
summary_data[(summary_data["Op Name"] == operation["Op Name"]) & + (summary_data["Input Shapes"] == operation["Input Shapes"]) & + (summary_data["Input Data Types"] == operation["Input Data Types"])] + op_data = op_data.sort_values(by=["node_id", "device_id"]) + node_ids = op_data['node_id'].unique() + device_ids = op_data['device_id'].unique() + + for node_id in node_ids: + for device_id in device_ids: + draw_data = op_data[(op_data['node_id'] == node_id) & (op_data['device_id'] == device_id)] + fig.add_trace(go.Box(y=draw_data[column], + name=f'{node_id}_{device_id}', + marker_color='green', showlegend=False), (i // col_num) + 1, (i % col_num) + 1) + + fig.update_xaxes(title_text=f'{operation["Op Name"]}-{operation["Input Shapes"]}', row=(i // col_num) + 1, + col=(i % col_num) + 1) + fig.update_layout(margin=dict(l=20, r=20, t=20, b=20), + height=int(500 * row_num), + width=int(rank_num * 100 * col_num), + title_text="Op Performance Comparison") + save_plot_path = os.path.join(self.result_dir, column + "_Info.html") + PathManager.check_path_length(save_plot_path) + plot(fig, filename=save_plot_path) + # 该文件权限设置为只读权限,不允许修改 + os.chmod(save_plot_path, stat.S_IROTH) + + def getCalNum(self, rank_num): + # 计算每行应该画多少个子图 + if rank_num <= 16: + return 2 + else: + return 1 + + +class DeliverableGenerator: + def __init__(self, params): + self.dirs = params.get('dir') + self.formProcess = FormDataProcessor(self.dirs, 'op_summary*.csv') + self.analyzers = [] + self.columns_to_keep = [] + self.setAnalyzers(params) + self.setColumnsToKeep() + + def run(self): + summary_data = self.formProcess.readSummaryData(self.columns_to_keep) + # 判断summarydata 数据是否为空,如果是空, 说明所有csv读取数据都失败了 + if summary_data.empty: + print("没有符合要求的csv表格数据,请排查您的PROFILING数据") + return + rank_num = self.formProcess.getRankNum() + for analyzer in self.analyzers: + analyzer.GenerateDeliverable(summary_data, rank_num) + + def setAnalyzers(self, params): + chip_type = self.formProcess.getChipType() + # 
判断该路径是不是软链接,并修改为绝对路径 + if os.path.islink(params.get('dir')): + print(f"The file: \"{params.get('dir')}\" is link. Please check the path.") + return + prof_path = os.path.realpath(params.get('dir')) + PathManager.input_path_common_check(prof_path) + if params.get('type') == "all": + self.analyzers = [TimeToCsvAnalyzer(chip_type, prof_path), StatisticalInfoToHtmlAnalyzer(chip_type, params.get("top_n"), prof_path)] + elif params.get('type') == "html": + self.analyzers = [StatisticalInfoToHtmlAnalyzer(chip_type, params.get("top_n"), prof_path)] + elif params.get('type') == "csv": + self.analyzers = [TimeToCsvAnalyzer(chip_type, prof_path)] + else: + warnings.warn("参数错误,请输入 all html csv 这三种类型") # 发出一个警告信息 + + + def setColumnsToKeep(self): + columns_to_keep = [] + for analyzer in self.analyzers: + columns_to_keep.extend(analyzer.getColumnsToGroup()) + columns_to_keep.extend(analyzer.getColumnsToView()) + self.columns_to_keep = list(set(columns_to_keep)) + + +def main(): + # 解析命令行参数 + parser = argparse.ArgumentParser() + parser.add_argument("--dir", "-d", default=None, help="root dir of PROF_* data") + parser.add_argument("--top_n", "-n", default=10, help="how many operators to show", type=int) + parser.add_argument("--type", "-t", default='html', help="compare ratio or aicore-time", type=str) + args = parser.parse_args() + params = { + "dir": args.dir, + "top_n": args.top_n, + "type": args.type + } + + deviverable_gen = DeliverableGenerator(params) + deviverable_gen.run() + +if __name__ == "__main__": + main() diff --git a/profiler/cluster_analyse_review/cluster_statistics_export/__init__.py b/profiler/cluster_analyse_review/cluster_statistics_export/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7101187a2c2619f3b1c20dded14b433950b4c662 --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_statistics_export/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/cluster_statistics_export/cann_api_sum_export.py b/profiler/cluster_analyse_review/cluster_statistics_export/cann_api_sum_export.py new file mode 100644 index 0000000000000000000000000000000000000000..578ee937be57ff8615085bbe1e4ac6ccae81a4e9 --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_statistics_export/cann_api_sum_export.py @@ -0,0 +1,65 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

from cluster_statistics_export.stats_export import StatsExport

# Per-API duration statistics over CANN_API: totals, count, mean, min,
# median, max, stddev and quartiles of (endNs - startNs), plus each API's
# percentage share of the summed duration.  API name ids are resolved via
# STRING_IDS.
# NOTE(review): median/stdev/lower_quartile/upper_quartile are not built-in
# SQLite aggregates — presumably registered by the project's DB layer; verify.
QUERY = """
WITH
    summary as (
        SELECT
            name,
            sum(endNs - startNs) AS duration,
            count (*) AS num,
            avg(endNs - startNs) AS avg_duration,
            min(endNs - startNs) AS min_duration,
            median(endNs - startNs) AS med_duration,
            max(endNs - startNs) AS max_duration,
            stdev(endNs - startNs) AS stdev_duration,
            lower_quartile(endNs - startNs) AS lower_quartile_duration,
            upper_quartile(endNs - startNs) AS upper_quartile_duration
        FROM
            CANN_API
        GROUP BY name
    ),
    totals AS (
        SELECT sum(duration) AS total
        FROM summary
    )
SELECT
    ids.value AS "name",
    round(summary.duration * 100.0 / (SELECT total FROM totals), 2) AS "durationRatio",
    summary.duration AS "totalTimeNs",
    summary.num AS "totalCount",
    round(summary.avg_duration, 1) AS "averageNs",
    round(summary.min_duration, 1) AS "minNs",
    round(summary.lower_quartile_duration, 1) AS "Q1Ns",
    round(summary.med_duration, 1) AS "medNs",
    round(summary.upper_quartile_duration, 1) AS "Q3Ns",
    round(summary.max_duration, 1) AS "maxNs",
    round(summary.stdev_duration, 1) AS "stdev"
FROM
    summary
LEFT JOIN
    STRING_IDS AS ids
    ON ids.id == summary.name
ORDER BY 2 DESC;
    """


class CannApiSumExport(StatsExport):
    """StatsExport flavor that reads CANN API duration statistics."""

    def __init__(self, db_path, recipe_name):
        super().__init__(db_path, recipe_name)
        self._query = QUERY
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cluster_statistics_export.stats_export import StatsExport


# One row per compute task: name / op type / task type / input shapes
# (all resolved through STRING_IDS) plus the task duration in ns.
QUERY = """
SELECT
    NAME_IDS.value AS "OpName",
    OPTYPE_IDS.value AS "OpType",
    TASKTYPE_IDS.value AS "TaskType",
    INPUTSHAPES_IDS.value AS "InputShapes",
    round(TASK.endNs - TASK.startNs) AS "Duration"
FROM
    COMPUTE_TASK_INFO
LEFT JOIN TASK
    ON TASK.globalTaskId == COMPUTE_TASK_INFO.globalTaskId
LEFT JOIN
    STRING_IDS AS NAME_IDS
    ON NAME_IDS.id == COMPUTE_TASK_INFO.name
LEFT JOIN
    STRING_IDS AS OPTYPE_IDS
    ON OPTYPE_IDS.id == COMPUTE_TASK_INFO.opType
LEFT JOIN
    STRING_IDS AS TASKTYPE_IDS
    ON TASKTYPE_IDS.id == COMPUTE_TASK_INFO.taskType
LEFT JOIN
    STRING_IDS AS INPUTSHAPES_IDS
    ON INPUTSHAPES_IDS.id == COMPUTE_TASK_INFO.inputShapes
    """


class ComputeOpSumExport(StatsExport):
    """StatsExport flavor that reads per-compute-task details."""

    def __init__(self, db_path, recipe_name):
        super().__init__(db_path, recipe_name)
        self._query = QUERY
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cluster_statistics_export.stats_export import StatsExport


# One row per communication op: name and type (resolved through STRING_IDS)
# plus the op duration in ns.
QUERY = """
SELECT
    NAME_IDS.value AS "OpName",
    TYPE_IDS.value AS "OpType",
    round(endNs - startNs) AS "Duration"
FROM
    COMMUNICATION_OP
LEFT JOIN
    STRING_IDS AS TYPE_IDS
    ON TYPE_IDS.id == COMMUNICATION_OP.opType
LEFT JOIN
    STRING_IDS AS NAME_IDS
    ON NAME_IDS.id == COMMUNICATION_OP.opName
    """


class HcclSumExport(StatsExport):
    """StatsExport flavor that reads HCCL communication-op durations."""

    def __init__(self, db_path, recipe_name):
        super().__init__(db_path, recipe_name)
        self._query = QUERY
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cluster_statistics_export.stats_export import StatsExport


# MSTX marks joined across layers via connectionId: mark message, CANN-side
# timestamp, device-side (TASK) timestamp, framework (PyTorch API) timestamp,
# and the emitting thread id, ordered by CANN timestamp.
QUERY = """
WITH
    FRAMEWORK_API AS (
        SELECT
            PYTORCH_API.startNs,
            CONNECTION_IDS.connectionId
        FROM
            PYTORCH_API
        LEFT JOIN
            CONNECTION_IDS
            ON PYTORCH_API.connectionId == CONNECTION_IDS.id
    )
SELECT
    MSG_IDS.value AS "msg",
    MSTX_EVENTS.startNs AS "cann_ts",
    TASK.startNs AS "device_ts",
    FRAMEWORK_API.startNs AS "framework_ts",
    MSTX_EVENTS.globalTid AS "tid"
FROM
    MSTX_EVENTS
LEFT JOIN
    TASK
    ON MSTX_EVENTS.connectionId == TASK.connectionId
LEFT JOIN
    FRAMEWORK_API
    ON MSTX_EVENTS.connectionId == FRAMEWORK_API.connectionId
LEFT JOIN
    STRING_IDS AS MSG_IDS
    ON MSTX_EVENTS.message == MSG_IDS.id
ORDER BY
    MSTX_EVENTS.startNs
    """


class MstxMarkExport(StatsExport):
    """StatsExport flavor that reads MSTX mark events across CANN/device/framework."""

    def __init__(self, db_path, recipe_name):
        super().__init__(db_path, recipe_name)
        self._query = QUERY
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cluster_statistics_export.stats_export import StatsExport


# Step boundaries from STEP_TIME: id plus start/end timestamps in ns,
# ordered chronologically.
QUERY = """
SELECT
    id AS "step_id",
    startNs AS "start_ns",
    endNs AS "end_ns"
FROM
    STEP_TIME
ORDER BY
    startNs
    """


class MstxStepExport(StatsExport):
    """StatsExport flavor that reads per-step time ranges."""

    def __init__(self, db_path, recipe_name):
        super().__init__(db_path, recipe_name)
        self._query = QUERY

import pandas as pd

from common_func.db_manager import DBManager
from common_func.constant import Constant


class StatsExport:
    """Base class for SQL exports against an analysis DB: subclasses set
    self._query; read_export_db runs it and returns a DataFrame."""

    def __init__(self, db_path, analysis_class):
        self._db_path = db_path
        self._analysis_class = analysis_class
        # Subclasses assign their SQL here; None means "not configured".
        self._query = None

    def get_query(self):
        return self._query

    def read_export_db(self):
        """Run the configured query and return a DataFrame, or None when no
        query is configured (an error is printed in that case)."""
        query = self.get_query()
        if query is None:
            print(f"[ERROR] query is None.")
            return
        conn, cursor = DBManager.create_connect_db(self._db_path, Constant.ANALYSIS)
        data = pd.read_sql(query, conn)
        DBManager.destroy_db_connect(conn, cursor)
        return data


import copy

from common_func.constant import Constant
from common_func.table_constant import TableConstant


class DataTransferAdapter(object):
    """Converts communication/bandwidth/matrix records between the DB row
    representation (flat dicts keyed by TableConstant names) and the nested
    JSON representation (keyed by Constant names).

    The *_TABLE_COLUMN / *_JSON_COLUMN class lists are positional pairs:
    zipping one against the other yields the field-name mapping used in
    both directions.
    """

    COMM_TIME_TABLE_COLUMN = [TableConstant.START_TIMESTAMP, TableConstant.ELAPSED_TIME, TableConstant.TRANSIT_TIME,
                              TableConstant.WAIT_TIME, TableConstant.SYNCHRONIZATION_TIME, TableConstant.IDLE_TIME,
                              TableConstant.SYNCHRONIZATION_TIME_RATIO, TableConstant.WAIT_TIME_RATIO]
    COMM_TIME_JSON_COLUMN = [Constant.START_TIMESTAMP, Constant.ELAPSE_TIME_MS, Constant.TRANSIT_TIME_MS,
                             Constant.WAIT_TIME_MS, Constant.SYNCHRONIZATION_TIME_MS, Constant.IDLE_TIME_MS,
                             Constant.SYNCHRONIZATION_TIME_RATIO, Constant.WAIT_TIME_RATIO]
    MATRIX_TABLE_COLUMN = [TableConstant.TRANSIT_SIZE, TableConstant.TRANSIT_TIME, TableConstant.BANDWIDTH,
                           TableConstant.TRANSPORT_TYPE, TableConstant.OPNAME]
    MATRIX_JSON_COLUMN = [Constant.TRANSIT_SIZE_MB, Constant.TRANSIT_TIME_MS, Constant.BANDWIDTH_GB_S,
                          Constant.TRANSPORT_TYPE, Constant.OP_NAME]
    COMM_BD_TABLE_COLUMN = [TableConstant.TRANSIT_SIZE, TableConstant.TRANSIT_TIME, TableConstant.BANDWIDTH,
                            TableConstant.LARGE_PACKET_RATIO]
    COMM_BD_JSON_COLUMN = [Constant.TRANSIT_SIZE_MB, Constant.TRANSIT_TIME_MS, Constant.BANDWIDTH_GB_S,
                           Constant.LARGE_PACKET_RATIO]

    def __init__(self):
        super().__init__()

    def transfer_comm_from_db_to_json(self, time_info: list, bandwidth_info: list):
        """Build nested JSON {step: {type: {"op@group": {...}}}} out of flat
        DB rows for communication time and bandwidth."""
        result = {}
        if not time_info and not bandwidth_info:
            return result
        for time_data in time_info:
            comm_time = dict()
            # JSON op key is "<hccl_op_name>@<group_name>".
            hccl_name = time_data[TableConstant.HCCL_OP_NAME] + "@" + time_data[TableConstant.GROUP_NAME]
            for key, value in dict(zip(self.COMM_TIME_JSON_COLUMN, self.COMM_TIME_TABLE_COLUMN)).items():
                # Ratio fields are recomputed downstream, so they are not copied.
                if not key.endswith("ratio"):
                    comm_time[key] = time_data.get(value, 0)
            result.setdefault(time_data[TableConstant.STEP], {}).setdefault(time_data[TableConstant.TYPE], {}). \
                setdefault(hccl_name, {})[Constant.COMMUNICATION_TIME_INFO] = comm_time
        hccl_set = set()
        for bd_data in bandwidth_info:
            hccl_name = bd_data[TableConstant.HCCL_OP_NAME] + "@" + bd_data[TableConstant.GROUP_NAME]
            hccl_set.add(hccl_name)
        for hccl in hccl_set:
            comm_bd = dict()
            for bd_data in bandwidth_info:
                if hccl == (bd_data[TableConstant.HCCL_OP_NAME] + "@" + bd_data[TableConstant.GROUP_NAME]):
                    temp_dict = dict()
                    key_dict = dict(zip(self.COMM_BD_JSON_COLUMN, self.COMM_BD_TABLE_COLUMN))
                    self.set_value_by_key(temp_dict, bd_data, key_dict)
                    # Per transport type keep the scalar stats once and fan the
                    # per-packet-size [count, total_duration] pairs into
                    # Size Distribution.
                    comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], temp_dict).setdefault(
                        Constant.SIZE_DISTRIBUTION, {})[bd_data[TableConstant.PACKAGE_SIZE]] = \
                        [bd_data[TableConstant.COUNT], bd_data[TableConstant.TOTAL_DURATION]]
            # NOTE(review): bd_data here is the last row of the inner loop —
            # works because every matching row shares step/type with `hccl`,
            # but it silently depends on bandwidth_info being non-empty.
            result.setdefault(bd_data[TableConstant.STEP], {}).setdefault(bd_data[TableConstant.TYPE], {}). \
                setdefault(hccl, {})[Constant.COMMUNICATION_BANDWIDTH_INFO] = comm_bd
        return result

    def transfer_comm_from_json_to_db(self, res_data: dict):
        """Inverse of transfer_comm_from_db_to_json: flatten nested JSON into
        (time_rows, bandwidth_rows) lists of flat dicts."""
        res_comm_data, res_bd_data = list(), list()

        # Both closures read rank_set/step/op_name/op_data from the loops at
        # the bottom of this method — they must only be called from there.
        def split_comm_time():
            for rank_id, comm_data in op_data.items():
                time_data = comm_data.get(Constant.COMMUNICATION_TIME_INFO)
                res_time = set_only_value(rank_id)
                for key, value in dict(zip(self.COMM_TIME_TABLE_COLUMN, self.COMM_TIME_JSON_COLUMN)).items():
                    res_time[key] = time_data.get(value, 0)
                res_comm_data.append(res_time)
                bd_data = comm_data.get(Constant.COMMUNICATION_BANDWIDTH_INFO, {})
                for transport_type, data in bd_data.items():
                    res_bandwidth = set_only_value(rank_id)
                    key_dict = dict(zip(self.COMM_BD_TABLE_COLUMN, self.COMM_BD_JSON_COLUMN))
                    res_bandwidth[TableConstant.TRANSPORT_TYPE] = transport_type
                    self.set_value_by_key(res_bandwidth, data, key_dict)
                    for key, value in data.get(Constant.SIZE_DISTRIBUTION, {}).items():
                        res_bandwidth[TableConstant.PACKAGE_SIZE] = key
                        res_bandwidth[TableConstant.COUNT] = value[0]
                        res_bandwidth[TableConstant.TOTAL_DURATION] = value[1]
                        # deepcopy: res_bandwidth is reused for every packet size.
                        temp_dict = copy.deepcopy(res_bandwidth)
                        res_bd_data.append(temp_dict)

        def set_only_value(rank_id):
            # Common identity columns shared by time and bandwidth rows.
            res_dict = dict()
            res_dict[TableConstant.RANK_SET] = str(rank_set)
            res_dict[TableConstant.STEP] = step
            res_dict[TableConstant.RANK_ID] = rank_id
            res_dict[TableConstant.HCCL_OP_NAME] = op_name.split("@")[0] if "@" in op_name else op_name
            res_dict[TableConstant.GROUP_NAME] = op_name.split("@")[1] if "@" in op_name else ""
            return res_dict

        for rank_set, step_dict in res_data.items():
            for step, op_dict in step_dict.items():
                for op_name, op_data in op_dict.items():
                    split_comm_time()
        return res_comm_data, res_bd_data

    def set_value_by_key(self, src_dict, dst_dict, key_dict):
        # Copy dst_dict[value] into src_dict[key] for every mapping pair,
        # defaulting missing fields to 0.
        for key, value in key_dict.items():
            src_dict[key] = dst_dict.get(value, 0)

    def transfer_matrix_from_db_to_json(self, matrix_data: list):
        """Build nested JSON {step: {type: {"op@group": {"src-dst": {...}}}}}
        from flat matrix DB rows."""
        result = {}
        if not matrix_data:
            return result
        hccl_set = set()
        for data in matrix_data:
            hccl = data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME]
            hccl_set.add(hccl)
        for hccl in hccl_set:
            for data in matrix_data:
                if hccl == (data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME]):
                    # Link key is "<src_rank>-<dst_rank>".
                    key = data[TableConstant.SRC_RANK] + '-' + data[TableConstant.DST_RANK]
                    temp_dict = dict()
                    key_dict = dict(zip(self.MATRIX_JSON_COLUMN, self.MATRIX_TABLE_COLUMN))
                    self.set_value_by_key(temp_dict, data, key_dict)
                    result.setdefault(data[TableConstant.STEP], {}).setdefault(data[TableConstant.TYPE], {}). \
                        setdefault(hccl, {}).setdefault(key, temp_dict)
        return result

    def transfer_matrix_from_json_to_db(self, res_data: dict):
        """Inverse of transfer_matrix_from_db_to_json: flatten nested JSON
        matrix data into a list of flat row dicts."""
        result = list()

        # Closure reads rank_set/step/op_dict from the loops below.
        def split_matrix_data():
            for op_name, op_data in op_dict.items():
                for link_key, link_data in op_data.items():
                    if "@" in op_name:
                        hccl_op_name, group_name = op_name.split("@")[0], op_name.split("@")[1]
                    else:
                        hccl_op_name, group_name = op_name, ""
                    matrix_data = {
                        TableConstant.RANK_SET: str(rank_set),
                        TableConstant.STEP: step,
                        TableConstant.HCCL_OP_NAME: hccl_op_name,
                        TableConstant.GROUP_NAME: group_name,
                        TableConstant.SRC_RANK: link_key.split("-")[0],
                        TableConstant.DST_RANK: link_key.split("-")[1]
                    }
                    key_dict = dict(zip(self.MATRIX_TABLE_COLUMN, self.MATRIX_JSON_COLUMN))
                    self.set_value_by_key(matrix_data, link_data, key_dict)
                    result.append(matrix_data)

        for rank_set, step_dict in res_data.items():
            for step, op_dict in step_dict.items():
                split_matrix_data()
        return result
# Copyright (c) 2024, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import inspect
import sys

from common_func.constant import Constant
from analysis.base_analysis import BaseRecipeAnalysis


def is_analysis_class(obj):
    """Predicate for inspect.getmembers: a strict subclass of BaseRecipeAnalysis."""
    return inspect.isclass(obj) and issubclass(obj, BaseRecipeAnalysis) and obj != BaseRecipeAnalysis


def get_class_from_name(analysis_name: str):
    """Import analysis.<name>.<name> and return its (class_name, class) pair.

    Returns None (after printing an error) when the module cannot be
    imported or contains no BaseRecipeAnalysis subclass.
    """
    sys.path.append(Constant.ANALYSIS_PATH)
    analysis_path = f"analysis.{analysis_name}.{analysis_name}"
    module = None
    try:
        module = importlib.import_module(analysis_path)
    except Exception as e:
        print(f"[ERROR] {analysis_path} not find:{e}")
        # Bug fix: previously fell through with module=None and crashed below.
        return None

    specific_analysis = inspect.getmembers(module, is_analysis_class)
    if not specific_analysis:
        print(f"[ERROR] {analysis_name} not found.")
        # Bug fix: previously continued to specific_analysis[0] and raised
        # IndexError immediately after printing the error message.
        return None
    return specific_analysis[0]
# Copyright (c) 2023, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

class Constant(object):
    """Shared string/number constants for the cluster analysis pipeline
    (directory names, file limits, JSON keys, DB table names, recipe config)."""

    # dir name
    FRAMEWORK_DIR = "FRAMEWORK"
    CLUSTER_ANALYSIS_OUTPUT = "cluster_analysis_output"
    SINGLE_OUTPUT = "ASCEND_PROFILER_OUTPUT"
    COMM_JSON = "communication.json"
    COMM_MATRIX_JSON = "communication_matrix.json"
    STEP_TIME_CSV = "step_trace_time.csv"
    KERNEL_DETAILS_CSV = "kernel_details.csv"

    # file authority
    FILE_AUTHORITY = 0o640
    DIR_AUTHORITY = 0o750
    MAX_JSON_SIZE = 1024 * 1024 * 1024 * 10
    MAX_CSV_SIZE = 1024 * 1024 * 1024 * 5
    MAX_PATH_LENGTH = 4096
    MAX_READ_DB_FILE_BYTES = 1024 * 1024 * 1024 * 8

    # communication
    P2P = "p2p"
    COLLECTIVE = "collective"
    STEP_ID = "step_id"
    RANK_ID = "rank_id"
    GROUP_NAME = "group_name"
    COMM_OP_TYPE = "comm_op_type"
    COMM_OP_NAME = "comm_op_name"
    COMM_OP_INFO = "comm_op_info"
    TOTAL_OP_INFO = "Total Op Info"
    COMMUNICATION_TIME_INFO = "Communication Time Info"
    START_TIMESTAMP = "Start Timestamp(us)"
    COMMUNICATION_BANDWIDTH_INFO = "Communication Bandwidth Info"
    HCOM_SEND = "hcom_send"
    HCOM_RECEIVE = "hcom_receive"
    SYNCHRONIZATION_TIME_RATIO = "Synchronization Time Ratio"
    SYNCHRONIZATION_TIME_MS = "Synchronization Time(ms)"
    WAIT_TIME_RATIO = "Wait Time Ratio"
    TRANSIT_TIME_MS = "Transit Time(ms)"
    TRANSIT_SIZE_MB = "Transit Size(MB)"
    SIZE_DISTRIBUTION = "Size Distribution"
    WAIT_TIME_MS = "Wait Time(ms)"
    OP_NAME = "Op Name"
    BANDWIDTH_GB_S = "Bandwidth(GB/s)"
    COMMUNICATION = "communication.json"
    ELAPSE_TIME_MS = "Elapse Time(ms)"
    IDLE_TIME_MS = "Idle Time(ms)"
    LARGE_PACKET_RATIO = "Large Packet Ratio"

    # params
    DATA_MAP = "data_map"
    COLLECTIVE_GROUP = "collective_group"
    COMMUNICATION_OPS = "communication_ops"
    MATRIX_OPS = "matrix_ops"
    COLLECTION_PATH = "collection_path"
    COMMUNICATION_GROUP = "communication_group"
    TRANSPORT_TYPE = "Transport Type"
    COMM_DATA_DICT = "comm_data_dict"
    DATA_TYPE = "data_type"
    ANALYSIS_MODE = "analysis_mode"

    # step time
    RANK = "rank"
    STAGE = "stage"

    # epsilon
    EPS = 1e-15

    # file suffix
    JSON_SUFFIX = ".json"
    CSV_SUFFIX = ".csv"

    # result files type
    TEXT = "text"
    DB = "db"
    INVALID = "invalid"

    # db name
    DB_COMMUNICATION_ANALYZER = "analysis.db"
    DB_CLUSTER_COMMUNICATION_ANALYZER = "cluster_analysis.db"

    # db tables
    TABLE_COMM_ANALYZER_BANDWIDTH = "CommAnalyzerBandwidth"
    TABLE_COMM_ANALYZER_TIME = "CommAnalyzerTime"
    TABLE_COMM_ANALYZER_MATRIX = "CommAnalyzerMatrix"
    TABLE_STEP_TRACE = "StepTraceTime"
    TABLE_HOST_INFO = "HostInfo"
    TABLE_RANK_DEVICE_MAP = "RankDeviceMap"

    # data config key
    CONFIG = "config"
    EXPER_CONFIG = "experimental_config"
    EXPORT_TYPE = "_export_type"

    # recipe config
    ANALYSIS = "analysis"
    RECIPE_NAME = "recipe_name"
    RECIPE_CLASS = "recipe_class"
    PARALLEL_MODE = "parallel_mode"
    # Recipes are looked up under <this package>/analysis at runtime.
    CLUSTER_CUSTOM_ANALYSE_PATH = os.path.abspath(os.path.dirname(__file__))
    ANALYSIS_PATH = os.path.join(CLUSTER_CUSTOM_ANALYSE_PATH, 'analysis')

    CONCURRENT_MODE = "concurrent"
class Context(object):
    """Abstract execution context for running analysis work.

    Obtain a concrete instance through the ``create_context`` factory; the
    only registered mode is ``Constant.CONCURRENT_MODE`` which maps to
    ``ConcurrentContext``.  Subclasses must implement ``launch``, ``map``,
    ``wait`` and ``close``.
    """

    ctx_map = None

    @classmethod
    def create_context(cls, mode=Constant.CONCURRENT_MODE):
        """Return a new context instance for *mode*.

        Raises:
            NotImplementedError: when *mode* is not a registered mode.
        """
        if cls.ctx_map is None:
            keys = [Constant.CONCURRENT_MODE]
            values = [ConcurrentContext]
            cls.ctx_map = dict(zip(keys, values))

        if mode not in cls.ctx_map:
            # Bug fix: the original formatted this message with the local
            # `keys`, which is unbound whenever ctx_map was already
            # initialised by a previous call -- that path raised NameError
            # instead of the intended NotImplementedError.
            raise NotImplementedError("mode must be in {}".format(list(cls.ctx_map.keys())))

        return cls.ctx_map[mode]()

    def __init__(self):
        # Subclasses set self._mode before calling super().__init__().
        print("[INFO] context {} initialized.".format(self._mode))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Release resources first, then report any in-flight exception.
        self.close()
        if exc_type is not None:
            print(f"[ERROR] Failed to exit context: {exc_val}")

    def launch(self, func, *args, **kwargs):
        raise NotImplementedError

    def map(self, func, *iterables, **kwargs):
        raise NotImplementedError

    def wait(self, waitable):
        raise NotImplementedError


class ConcurrentContext(Context):
    """Context backed by a ``ProcessPoolExecutor`` (one worker per CPU).

    An external executor may be injected; in that case ``close`` does not
    shut it down (ownership stays with the caller).
    """

    def __init__(self, executor=None):
        self._mode = Constant.CONCURRENT_MODE
        super().__init__()
        # Only shut down executors we created ourselves.
        self._custom = executor is None
        self._executor = executor or futures.ProcessPoolExecutor(max_workers=os.cpu_count())

    def __enter__(self):
        if self._executor is None:
            raise RuntimeError("executor is None")
        return self

    def close(self):
        if self._custom:
            self._executor.shutdown(wait=True)
            self._executor = None

    def launch(self, func, *args, **kwargs):
        # NOTE(review): .result() blocks until completion, so launch() is
        # effectively synchronous despite using the executor.
        return self._executor.submit(func, *args, **kwargs).result()

    def map(self, func, *iterables, **kwargs):
        partial_func = partial(func, **kwargs)
        return list(self._executor.map(partial_func, *iterables))

    def wait(self, waitable):
        return waitable
class DBManager:
    """
    class to manage DB operation

    All helpers degrade gracefully: on sqlite3 errors they print the error
    and return a falsy placeholder (EmptyClass / False / [] / 0) instead of
    raising, so callers must check results.
    """
    FETCH_SIZE = 10000
    INSERT_SIZE = 10000
    MAX_ROW_COUNT = 100000000

    @staticmethod
    def create_connect_db(db_path: str, mode=None) -> tuple:
        """
        create and connect database

        Returns (connection, cursor) on success; a pair of falsy EmptyClass
        placeholders on any failure.  When mode == Constant.ANALYSIS the
        custom aggregate functions (median, quartiles, stdev) are registered
        on the connection before the cursor is created.
        """
        if check_db_path_valid(db_path, is_create=True):
            try:
                conn = sqlite3.connect(db_path)
            except sqlite3.Error as err:
                print(f"[ERROR] {err}")
                return EmptyClass("empty conn"), EmptyClass("empty curs")
            try:
                if mode == Constant.ANALYSIS:
                    try:
                        for func_name, params_count, class_name in SqlExtentionAggregateFunc:
                            conn.create_aggregate(func_name, params_count, class_name)
                    except sqlite3.Error as err:
                        # Aggregate registration failure is non-fatal; the
                        # connection is still handed back below.
                        print(f"[ERROR] {err}")
                if isinstance(conn, sqlite3.Connection):
                    curs = conn.cursor()
                    # Tighten permissions on the (possibly new) db file.
                    os.chmod(db_path, Constant.FILE_AUTHORITY)
                    return conn, curs
            except sqlite3.Error as err:
                print(f"[ERROR] {err}")
                return EmptyClass("empty conn"), EmptyClass("empty curs")
        return EmptyClass("empty conn"), EmptyClass("empty curs")

    @staticmethod
    def destroy_db_connect(conn: any, curs: any) -> None:
        """
        destroy db connection

        Closes cursor and connection independently; errors are printed and
        swallowed so cleanup never raises.
        """
        try:
            if isinstance(curs, sqlite3.Cursor):
                curs.close()
        except sqlite3.Error as err:
            print(f"[ERROR] {err}")
        try:
            if isinstance(conn, sqlite3.Connection):
                conn.close()
        except sqlite3.Error as err:
            print(f"[ERROR] {err}")

    @staticmethod
    def judge_table_exists(curs: any, table_name: str) -> any:
        """
        judge table exists

        Returns the sqlite_master count (truthy when the table exists),
        False on an invalid cursor or query error.
        """
        if not isinstance(curs, sqlite3.Cursor):
            return False
        try:
            curs.execute("select count(*) from sqlite_master where type='table' and name=?", (table_name,))
            return curs.fetchone()[0]
        except sqlite3.Error as err:
            print("[ERROR] {}".format(err))
            return False

    @staticmethod
    def sql_generate_table(table_map: str):
        """Build the "(col TYPE, ...)" clause for a CREATE TABLE statement
        from TablesConfig.DATA[table_map]; returns "" for unknown maps.
        """
        header_with_type_begin = "("
        header_with_type_end = ")"
        header_with_type_list = []
        if table_map in TablesConfig.DATA:
            items = TablesConfig.DATA[table_map]
            for item in items:
                if item[0] == "index":
                    # "index" is an SQL keyword, so the column name is quoted.
                    header_with_type_list.append('"' + item[0] + '" ' + item[1].split(",")[0])
                else:
                    header_with_type_list.append(item[0] + ' ' + item[1].split(",")[0])
            header_with_type_begin += ",".join(header_with_type_list)
            header_with_type_begin += header_with_type_end
            return header_with_type_begin
        return ""

    @classmethod
    def check_tables_in_db(cls, db_path: any, *tables: any) -> bool:
        """Return True iff every table in *tables* exists in the db at db_path."""
        if check_db_path_valid(db_path):
            conn, curs = cls.create_connect_db(db_path)
            if not (conn and curs):
                return False
            res = True
            for table in tables:
                if not cls.judge_table_exists(curs, table):
                    res = False
                    break
            cls.destroy_db_connect(conn, curs)
            return res
        return False

    @classmethod
    def create_tables(cls, db_path: any, *tables: any):
        """(Re)create each named table: existing tables are dropped first,
        then recreated from the "<TableName>Map" schema in TablesConfig.
        """
        conn, curs = cls.create_connect_db(db_path)
        if not (conn and curs):
            return
        for table_name in tables:
            if cls.judge_table_exists(curs, table_name):
                drop_sql = "drop table {0}".format(table_name)
                cls.execute_sql(conn, drop_sql)
            table_map = "{0}Map".format(table_name)
            header_with_type = cls.sql_generate_table(table_map)
            sql = "CREATE TABLE IF NOT EXISTS " + table_name + header_with_type
            cls.execute_sql(conn, sql)
        cls.destroy_db_connect(conn, curs)

    @classmethod
    def get_table_column_count(cls, db_path: any, table: any) -> int:
        """Return the number of columns of *table*, or 0 on any failure."""
        conn, curs = cls.create_connect_db(db_path)
        if not (conn and curs):
            return 0
        sql = "SELECT COUNT(*) FROM pragma_table_info('{}')".format(table)
        res = 0
        try:
            curs.execute(sql)
            res = curs.fetchone()[0]
        except sqlite3.Error as err:
            print("[ERROR] {}".format(err))
        finally:
            cls.destroy_db_connect(conn, curs)
        return res

    @staticmethod
    def execute_sql(conn: any, sql: str, params: any = None) -> bool:
        """
        execute sql

        Commits on success; returns False (after printing) on error or when
        conn is not a sqlite3.Connection.
        """
        try:
            if isinstance(conn, sqlite3.Connection):
                if params:
                    conn.cursor().execute(sql, params)
                else:
                    conn.cursor().execute(sql)
                conn.commit()
                return True
        except sqlite3.Error as err:
            print(f"[ERROR] {err}")
            return False
        print("[ERROR] conn is invalid param")
        return False

    @staticmethod
    def executemany_sql(conn: any, sql: str, params: any) -> bool:
        """
        execute many sql once

        Bulk variant of execute_sql using cursor.executemany.
        """
        try:
            if isinstance(conn, sqlite3.Connection):
                conn.cursor().executemany(sql, params)
                conn.commit()
                return True
        except sqlite3.Error as err:
            print(f"[ERROR] {err}")
            return False
        print("[ERROR] conn is invalid param")
        return False

    @classmethod
    def fetch_all_data(cls: any, curs: any, sql: str, param: tuple = None, is_dict: bool = True) -> list:
        """
        fetch 10000 num of data from db each time to get all data

        Returns rows as dicts (column name -> value) when is_dict is True,
        otherwise raw tuples; [] on error.  Rows beyond MAX_ROW_COUNT only
        trigger a warning -- fetching continues regardless.
        """
        if not isinstance(curs, sqlite3.Cursor):
            return []
        data = []
        try:
            if param:
                res = curs.execute(sql, param)
            else:
                res = curs.execute(sql)
        except sqlite3.Error as err:
            print(f"[ERROR] {err}")
            curs.row_factory = None
            return []
        try:
            description = res.description
            while True:
                res = curs.fetchmany(cls.FETCH_SIZE)
                if is_dict:
                    data += CustomizedDictFactory.generate_dict_from_db(res, description)
                else:
                    data += res
                if len(data) > cls.MAX_ROW_COUNT:
                    # NOTE(review): message spelling ("WARRING") kept as-is;
                    # it is runtime output, not a comment.
                    print("[WARRING] The records count in the table exceeds the limit!")
                if len(res) < cls.FETCH_SIZE:
                    break
            return data
        except sqlite3.Error as err:
            print(f"[ERROR] {err}")
            return []
        finally:
            curs.row_factory = None


class CustomizedDictFactory:
    """Converts raw cursor rows into dicts keyed by column name."""

    @staticmethod
    def generate_dict_from_db(data_result: any, description: any) -> any:
        # description is cursor.description; element [0] is the column name.
        description_set = [i[0] for i in description]
        res = []
        for data in data_result:
            data_dict = dict(zip(description_set, data))
            res.append(data_dict)
        return res
class EmptyClass:
    """Falsy placeholder returned where a real object could not be produced
    (e.g. a failed DB connection): evaluates to False, prints as "".
    """

    def __init__(self: any, info: str = "") -> None:
        # Free-form description of what this placeholder stands in for.
        self._info = info

    def __bool__(self) -> bool:
        # Placeholders are always falsy so callers can `if not conn: ...`.
        return False

    def __str__(self) -> str:
        return ""

    @property
    def info(self: any) -> str:
        return self._info

    @staticmethod
    def is_empty() -> bool:
        return True
class FileManager:
    """Size-checked, permission-aware read/write helpers for the cluster
    analysis output files (CSV and JSON)."""

    DATA_FILE_AUTHORITY = 0o640
    DATA_DIR_AUTHORITY = 0o750

    @classmethod
    def read_csv_file(cls, file_path: str, class_bean: any) -> list:
        """Read a CSV file and wrap every row dict with *class_bean*.

        Returns [] for empty files; raises RuntimeError for oversized or
        unreadable files.
        """
        PathManager.check_path_readable(file_path)
        base_name = os.path.basename(file_path)
        file_size = os.path.getsize(file_path)
        if file_size <= 0:
            return []
        if file_size > Constant.MAX_CSV_SIZE:
            raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.")
        result_data = []
        try:
            with open(file_path, newline="") as csv_file:
                reader = csv.DictReader(csv_file)
                for row in reader:
                    result_data.append(class_bean(row))
        except Exception as e:
            raise RuntimeError(f"Failed to read the file: {base_name}") from e
        return result_data

    @classmethod
    def read_json_file(cls, file_path: str) -> dict:
        """Read and parse a JSON file; {} for empty files, RuntimeError for
        oversized or unparseable files."""
        PathManager.check_path_readable(file_path)
        base_name = os.path.basename(file_path)
        file_size = os.path.getsize(file_path)
        if file_size <= 0:
            return {}
        if file_size > Constant.MAX_JSON_SIZE:
            raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.")
        try:
            with open(file_path, "r") as json_file:
                result_data = json.loads(json_file.read())
        except Exception as e:
            raise RuntimeError(f"Failed to read the file: {base_name}") from e
        return result_data

    @classmethod
    def create_csv_file(cls, profiler_path: str, data: list, file_name: str, headers: list = None) -> None:
        """Write *data* rows (optionally preceded by *headers*) into the
        cluster analysis output dir; silently does nothing for empty data."""
        if not data:
            return
        output_path = os.path.join(
            profiler_path, Constant.CLUSTER_ANALYSIS_OUTPUT)
        output_file = os.path.join(output_path, file_name)
        base_name = os.path.basename(output_file)
        PathManager.check_path_writeable(output_path)
        try:
            # os.open + fdopen so the file is created with restricted mode.
            with os.fdopen(
                    os.open(output_file, os.O_WRONLY | os.O_CREAT, cls.DATA_FILE_AUTHORITY),
                    'w', newline=""
            ) as file:
                writer = csv.writer(file)
                if headers:
                    writer.writerow(headers)
                writer.writerows(data)
        except Exception as e:
            raise RuntimeError(f"Can't create file: {base_name}") from e

    @classmethod
    def create_json_file(cls, profiler_path: str, data: dict, file_name: str) -> None:
        """Serialize *data* as JSON into the cluster analysis output dir;
        silently does nothing for empty data."""
        if not data:
            return
        output_path = os.path.join(profiler_path, Constant.CLUSTER_ANALYSIS_OUTPUT)
        output_file = os.path.join(output_path, file_name)
        base_name = os.path.basename(output_file)
        PathManager.check_path_writeable(output_path)
        try:
            with os.fdopen(
                    os.open(output_file, os.O_WRONLY | os.O_CREAT, cls.DATA_FILE_AUTHORITY), 'w'
            ) as file:
                file.write(json.dumps(data))
        except Exception as e:
            raise RuntimeError(f"Can't create the file: {base_name}") from e

    @classmethod
    def create_output_dir(cls, collection_path: str, is_overwrite: bool = False) -> None:
        """Ensure the cluster analysis output directory exists.

        NOTE(review): with is_overwrite=True an existing directory is kept
        untouched, while is_overwrite=False removes and recreates it -- the
        flag name suggests the opposite; behavior preserved, confirm intent
        against callers.
        """
        output_path = os.path.join(
            collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT)
        if is_overwrite:
            if not os.path.exists(output_path):
                PathManager.make_dir_safety(output_path)
            return
        PathManager.remove_path_safety(output_path)
        PathManager.make_dir_safety(output_path)

    @classmethod
    def check_file_size(cls, file_path):
        """Raise RuntimeError when *file_path* exceeds the size limit for its
        file type (CSV vs everything else)."""
        # Bug fix: os.path.splitext returns a (root, ext) tuple, so the
        # original comparison `suffix == Constant.CSV_SUFFIX` was always
        # False and every file was checked against the JSON limit. Take the
        # extension component.
        suffix = os.path.splitext(file_path)[1]
        # Bug fix: os.path.join(file_path) is an identity call; use the base
        # name, matching the error messages of the other methods.
        base_name = os.path.basename(file_path)
        if suffix == Constant.CSV_SUFFIX:
            limit_size = Constant.MAX_CSV_SIZE
        else:
            limit_size = Constant.MAX_JSON_SIZE
        file_size = os.path.getsize(file_path)
        if file_size > limit_size:
            raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.")


def check_db_path_valid(path: str, is_create: bool = False, max_size: int = Constant.MAX_READ_DB_FILE_BYTES) -> bool:
    """Return True when *path* is acceptable as a db file: not a symlink and
    (unless it is being created) not larger than *max_size* bytes."""
    if os.path.islink(path):
        print(f'[ERROR] The db file path: {path} is link. Please check the path')
        return False
    if not is_create and os.path.exists(path) and os.path.getsize(path) > max_size:
        print(f'[ERROR] The db file: {path} is too large to read. Please check the file')
        return False
    return True
class PathManager:
    """Validation and safe-creation helpers for filesystem paths.

    All checks raise RuntimeError on failure; symlinks are rejected
    everywhere to avoid link-following attacks.
    """

    MAX_PATH_LENGTH = 4096
    MAX_FILE_NAME_LENGTH = 255
    DATA_FILE_AUTHORITY = 0o640
    DATA_DIR_AUTHORITY = 0o750
    WINDOWS = "windows"

    @classmethod
    def check_input_directory_path(cls, path: str):
        """
        Function Description:
            check whether the path is valid, some businesses can accept a path that does not exist,
            so the function do not verify whether the path exists
        Parameter:
            path: the path to check, whether the incoming path is absolute or relative depends on the business
        Exception Description:
            when invalid data throw exception
        """
        cls.input_path_common_check(path)
        base_name = os.path.basename(path)
        if os.path.isfile(path):
            msg = f"Invalid input path which is a file path: {base_name}"
            raise RuntimeError(msg)

    @classmethod
    def check_input_file_path(cls, path: str):
        """
        Function Description:
            check whether the file path is valid, some businesses can accept a path that does not exist,
            so the function do not verify whether the path exists
        Parameter:
            path: the file path to check, whether the incoming path is absolute or relative depends on the business
        Exception Description:
            when invalid data throw exception
        """
        cls.input_path_common_check(path)
        base_name = os.path.basename(path)
        if os.path.isdir(path):
            msg = f"Invalid input path which is a directory path: {base_name}"
            raise RuntimeError(msg)

    @classmethod
    def check_path_length(cls, path: str):
        """Reject paths longer than MAX_PATH_LENGTH or containing any
        component (split on both / and \\) longer than MAX_FILE_NAME_LENGTH."""
        if len(path) > cls.MAX_PATH_LENGTH:
            raise RuntimeError("Length of input path exceeds the limit.")
        path_split_list = path.split("/")
        # NOTE(review): the loop variable shadows the `path` parameter; it is
        # not used after the loop, so behavior is unaffected.
        for path in path_split_list:
            path_list = path.split("\\")
            for name in path_list:
                if len(name) > cls.MAX_FILE_NAME_LENGTH:
                    raise RuntimeError("Length of input path exceeds the limit.")

    @classmethod
    def input_path_common_check(cls, path: str):
        """Shared validation: length limit, symlink rejection, and an
        allow-list of characters (CJK allowed only on Windows)."""
        if len(path) > cls.MAX_PATH_LENGTH:
            raise RuntimeError("Length of input path exceeds the limit.")

        if os.path.islink(path):
            msg = f"Invalid input path which is a soft link."
            raise RuntimeError(msg)

        if platform.system().lower() == cls.WINDOWS:
            pattern = r'(\.|:|\\|/|_|-|\s|[~0-9a-zA-Z\u4e00-\u9fa5])+'
        else:
            pattern = r'(\.|/|_|-|\s|[~0-9a-zA-Z])+'
        if not re.fullmatch(pattern, path):
            msg = f"Invalid input path."
            raise RuntimeError(msg)

        # Per-component length check (duplicates check_path_length's loop).
        path_split_list = path.split("/")
        for path in path_split_list:
            path_list = path.split("\\")
            for name in path_list:
                if len(name) > cls.MAX_FILE_NAME_LENGTH:
                    raise RuntimeError("Length of input path exceeds the limit.")

    @classmethod
    def check_path_owner_consistent(cls, path: str):
        """
        Function Description:
            check whether the path belong to process owner
        Parameter:
            path: the path to check
        Exception Description:
            when invalid path, prompt the user
        """
        base_name = os.path.basename(path)
        if not os.path.exists(path):
            msg = f"Invalid path: {base_name}"
            raise RuntimeError(msg)
        if platform.system().lower() == cls.WINDOWS:
            # Ownership semantics differ on Windows; skip the uid check.
            return
        if os.stat(path).st_uid != os.getuid():
            # Interactive confirmation when the path belongs to another user.
            check_msg = input("The path does not belong to you, do you want to continue? [y/n]")
            if check_msg.lower() != "y":
                raise RuntimeError("The user choose not to continue.")

    @classmethod
    def check_path_writeable(cls, path):
        """
        Function Description:
            check whether the path is writable
        Parameter:
            path: the path to check
        Exception Description:
            when invalid data throw exception
        """
        cls.check_path_owner_consistent(path)
        if os.path.islink(path):
            msg = f"Invalid path which is a soft link."
            raise RuntimeError(msg)
        base_name = os.path.basename(path)
        if not os.access(path, os.W_OK):
            msg = f"The path permission check failed: {base_name}"
            raise RuntimeError(msg)

    @classmethod
    def check_path_readable(cls, path):
        """
        Function Description:
            check whether the path is readable
        Parameter:
            path: the path to check
        Exception Description:
            when invalid data throw exception
        """
        cls.check_path_owner_consistent(path)
        if os.path.islink(path):
            msg = f"Invalid path which is a soft link."
            raise RuntimeError(msg)
        base_name = os.path.basename(path)
        if not os.access(path, os.R_OK):
            msg = f"The path permission check failed: {base_name}"
            raise RuntimeError(msg)

    @classmethod
    def remove_path_safety(cls, path: str):
        """Recursively remove *path* if it exists; refuses symlinks."""
        base_name = os.path.basename(path)
        msg = f"Failed to remove path: {base_name}"
        if os.path.islink(path):
            raise RuntimeError(msg)
        if os.path.exists(path):
            try:
                shutil.rmtree(path)
            except Exception as err:
                raise RuntimeError(msg) from err

    @classmethod
    def make_dir_safety(cls, path: str):
        """Create *path* (and parents) with DATA_DIR_AUTHORITY; no-op when it
        already exists; refuses symlinks."""
        base_name = os.path.basename(path)
        msg = f"Failed to make directory: {base_name}"
        if os.path.islink(path):
            raise RuntimeError(msg)
        if os.path.exists(path):
            return
        try:
            os.makedirs(path, mode=cls.DATA_DIR_AUTHORITY)
        except Exception as err:
            raise RuntimeError(msg) from err

    @classmethod
    def create_file_safety(cls, path: str):
        """Create an empty file with DATA_FILE_AUTHORITY; no-op when it
        already exists; refuses symlinks."""
        base_name = os.path.basename(path)
        msg = f"Failed to create file: {base_name}"
        if os.path.islink(path):
            raise RuntimeError(msg)
        if os.path.exists(path):
            return
        try:
            os.close(os.open(path, os.O_WRONLY | os.O_CREAT, cls.DATA_FILE_AUTHORITY))
        except Exception as err:
            raise RuntimeError(msg) from err

    @classmethod
    def get_realpath(cls, path: str) -> str:
        """Resolve *path* to its canonical absolute form; refuses symlinks."""
        if os.path.islink(path):
            msg = f"Invalid input path which is a soft link."
            raise RuntimeError(msg)
        return os.path.realpath(path)
class _StatAccumulator:
    """Shared accumulator for the sqlite aggregate functions below: collect
    values through step(); each subclass reduces them in finalize()."""

    def __init__(self) -> None:
        self.data = []

    def step(self, value) -> None:
        self.data.append(value)


class Median(_StatAccumulator):
    """Aggregate returning the median of the accumulated values."""

    def finalize(self):
        return np.median(self.data)


class LowerQuartile(_StatAccumulator):
    """Aggregate returning the 25th percentile of the accumulated values."""

    def finalize(self):
        return np.quantile(self.data, 0.25)


class UpperQuartile(_StatAccumulator):
    """Aggregate returning the 75th percentile of the accumulated values."""

    def finalize(self):
        return np.quantile(self.data, 0.75)


class StandardDeviation(_StatAccumulator):
    """Aggregate returning the (population) standard deviation."""

    def finalize(self):
        return np.std(self.data)


# func_name, params_count, class -- registered on sqlite connections via
# Connection.create_aggregate.
SqlExtentionAggregateFunc = [
    ('median', 1, Median),
    ('lower_quartile', 1, LowerQuartile),
    ('upper_quartile', 1, UpperQuartile),
    ('stdev', 1, StandardDeviation)
]
class TablesConfig:
    """Schema registry consumed by DBManager.sql_generate_table.

    Keys are "<TableName>Map"; each value is an ordered list of
    (column_name, "SQLTYPE, null") pairs.  Only the part before the comma is
    used as the column type when generating CREATE TABLE statements; the
    "index" column gets quoted because it is an SQL keyword.
    """
    DATA = {
        "ClusterCommAnalyzerTimeMap": [
            ("rank_set", "TEXT, null"),
            ("step", "TEXT, null"),
            ("rank_id", "INTEGER, null"),
            ("hccl_op_name", "TEXT, null"),
            ("group_name", "TEXT, null"),
            ("start_timestamp", "NUMERIC, null"),
            ("elapsed_time", "NUMERIC, null"),
            ("transit_time", "NUMERIC, null"),
            ("wait_time", "NUMERIC, null"),
            ("synchronization_time", "NUMERIC, null"),
            ("idle_time", "NUMERIC, null"),
            ("synchronization_time_ratio", "NUMERIC, null"),
            ("wait_time_ratio", "NUMERIC, null")
        ],
        "CommunicationGroupMap": [
            ("type", "TEXT, null"),
            ("rank_set", "TEXT, null")
        ],
        "ClusterCommAnalyzerBandwidthMap": [
            ("rank_set", "TEXT, null"),
            ("step", "TEXT, null"),
            ("rank_id", "INTEGER, null"),
            ("hccl_op_name", "TEXT, null"),
            ("group_name", "TEXT, null"),
            ("band_type", "TEXT, null"),
            ("transit_size", "NUMERIC, null"),
            ("transit_time", "NUMERIC, null"),
            ("bandwidth", "NUMERIC, null"),
            ("large_packet_ratio", "NUMERIC, null"),
            ("package_size", "NUMERIC, null"),
            ("count", "NUMERIC, null"),
            ("total_duration", "NUMERIC, null")
        ],
        "ClusterCommAnalyzerMatrixMap": [
            ("rank_set", "TEXT, null"),
            ("step", "TEXT, null"),
            ("hccl_op_name", "TEXT, null"),
            ("group_name", "TEXT, null"),
            ("src_rank", "TEXT, null"),
            ("dst_rank", "TEXT, null"),
            ("transit_size", "NUMERIC, null"),
            ("transit_time", "NUMERIC, null"),
            ("bandwidth", "NUMERIC, null"),
            ("transport_type", "TEXT, null"),
            ("op_name", "TEXT, null")
        ],
        "ClusterStepTraceTimeMap": [
            ("step", "TEXT, null"),
            ("type", "TEXT, null"),
            ("index", "TEXT, null"),
            ("computing", "NUMERIC, null"),
            ("communication_not_overlapped", "NUMERIC, null"),
            ("overlapped", "NUMERIC, null"),
            ("communication", "NUMERIC, null"),
            ("free", "NUMERIC, null"),
            ("stage", "NUMERIC, null"),
            ("bubble", "NUMERIC, null"),
            ("communication_not_overlapped_and_exclude_receive", "NUMERIC, null"),
            ("preparing", "NUMERIC, null")
        ],
        "HostInfoMap": [
            ("hostUid", "INTEGER, null"),
            ("hostName", "TEXT, null")
        ],
        "RankDeviceMapMap": [
            ("rankId", "INTEGER, null"),
            ("deviceId", "INTEGER, null"),
            ("hostUid", "INTEGER, null")
        ]
    }
def format_columns(df: pd.DataFrame):
    """Normalize statistic column labels to the *Ns naming scheme and move
    the statistics columns to the front.

    Handles the labels produced by DataFrame.describe ("25%", "min", ...),
    Series.quantile (0.25, 0.5, 0.75) and custom aggregations ("Q1", "Q3").
    NOTE(review): assumes all of stats_cols exist after renaming -- a frame
    missing one of them raises KeyError on the final selection; confirm
    against callers.
    """
    formatted_df = df.rename(
        {
            "25%": "Q1Ns",
            "50%": "MedianNs",
            "75%": "Q3Ns",
            0.25: "Q1Ns",
            0.5: "MedianNs",
            0.75: "Q3Ns",
            "Q1": "Q1Ns",
            "Q3": "Q3Ns",
            "min": "MinNs",
            "max": "MaxNs",
            "median": "MedianNs",
            "sum": "SumNs",
            "std": "StdNs",
            "mean": "MeanNs",
            "count": "Count"
        },
        axis="columns"
    )

    stats_cols = ["Count", "MeanNs", "StdNs", "MinNs", "Q1Ns", "MedianNs", "Q3Ns", "MaxNs", "SumNs"]
    other_cols = [col for col in formatted_df.columns if col not in stats_cols]
    return formatted_df[stats_cols + other_cols]


def describe_duration(series_groupby):
    """Compute min/max/count/std/mean/sum plus the three quartiles for a
    grouped duration series and return them with format_columns naming."""
    agg_df = series_groupby.agg(["min", "max", "count", "std", "mean", "sum"])
    quantile_df = series_groupby.quantile([0.25, 0.5, 0.75])

    # quantile() yields a MultiIndex series; unstack to one column per level.
    quantile_df = quantile_df.unstack()
    quantile_df.columns = ["25%", "50%", "75%"]

    stats_df = pd.merge(agg_df, quantile_df, left_index=True, right_index=True)
    formated_df = format_columns(stats_df)
    formated_df.index.name = stats_df.index.name
    return formated_df


def stdev(df, aggregated):
    """Pooled standard deviation across the groups of *df*, combining each
    group's own variance with its deviation from the aggregated mean.

    NOTE(review): reads columns "stdevNs" (single-row shortcut), "stdev",
    "totalCount" and "averageNs" -- the mixed Ns/no-Ns naming suggests
    callers pre-shape the frame; confirm against call sites.
    """
    if len(df) <= 1:
        return df["stdevNs"].iloc[0]
    instance = aggregated["totalCount"].loc[df.name]
    var_sum = np.dot(df["totalCount"] - 1, df["stdev"] ** 2)
    deviation = df["averageNs"] - aggregated["averageNs"].loc[df.name]
    dev_sum = np.dot(df["totalCount"], deviation ** 2)
    return np.sqrt((var_sum + dev_sum) / (instance - 1))


def convert_unit(df: pd.DataFrame, src_unit, dst_unit):
    """Divide every column whose name ends with *src_unit* by 1000 and
    rename the suffix to "(dst_unit)".

    NOTE(review): the factor is hard-coded to 1000.0 regardless of the unit
    pair, so this only converts between adjacent metric units (e.g. Ns->us);
    mutates the passed frame's values in place and returns a renamed copy.
    """
    df.loc[:, df.columns.str.endswith(src_unit)] = df.loc[:, df.columns.str.endswith(src_unit)].apply(lambda x: x / 1000.0)
    df = df.rename(columns=lambda x: x.replace(src_unit, "".join(["(", dst_unit, ")"])))
    return df
0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c --- /dev/null +++ b/profiler/cluster_analyse_review/communication_group/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/communication_group/base_communication_group.py b/profiler/cluster_analyse_review/communication_group/base_communication_group.py new file mode 100644 index 0000000000000000000000000000000000000000..23d7cb2986814e6e8cb45ac4ee9003f227ac881f --- /dev/null +++ b/profiler/cluster_analyse_review/communication_group/base_communication_group.py @@ -0,0 +1,227 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from abc import abstractmethod +from collections import defaultdict +from copy import deepcopy +from multiprocessing import Pool + +from common_func.constant import Constant +from cluster_utils.data_transfer_adapter import DataTransferAdapter + + +class BaseCommunicationGroup: + def __init__(self, params: dict): + self.collection_path = params.get(Constant.COLLECTION_PATH) + self.data_map = params.get(Constant.DATA_MAP) + self.data_type = params.get(Constant.DATA_TYPE) + self.analysis_mode = params.get(Constant.ANALYSIS_MODE) + self.rank_comm_dir_dict = {} + self.p2p_link = [] + self.collective_group_dict = defaultdict(set) + self.p2p_comm_group = [] + self.communication_group = {} + self.communication_ops = [] + self.matrix_ops = [] + self.adapter = DataTransferAdapter() + + def load_communication_data(self): + comm_op_dirs = [] + for rank_id, profiling_dir_path in self.data_map.items(): + if self.data_type == Constant.TEXT: + comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_JSON) + matrix_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_MATRIX_JSON) + else: + comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.DB_COMMUNICATION_ANALYZER) + matrix_dir = comm_dir + if os.path.exists(comm_dir) or os.path.exists(matrix_dir): + comm_op_dirs.append((rank_id, comm_dir, matrix_dir)) + else: + print( + f"[WARNING] Rank {rank_id} does not have valid communication data and communication_matrix data.") + with Pool() as p: + self.rank_comm_dir_dict = p.map(self.read_communication_func, comm_op_dirs) + + def set_p2p_groups(self): + self.p2p_link = sorted(self.p2p_link, key=lambda x: min(x)) + while self.p2p_link: + union_set = deepcopy(self.p2p_link[0]) + rm_list = [self.p2p_link[0]] + for idx, link_rank_set_x in enumerate(self.p2p_link[1:]): + if UnionFind.is_connected(link_rank_set_x, union_set): + union_set = union_set.union(link_rank_set_x) + rm_list.append(link_rank_set_x) + 
self.p2p_comm_group.append(union_set) + self.p2p_link = [element for element in self.p2p_link if element not in rm_list] + + def generate_collective_communication_group(self): + self.communication_group[Constant.COLLECTIVE] = \ + [list(group) for group_name, group in self.collective_group_dict.items()] + + def generate_p2p_communication_group(self): + stage_group = {} + for group_name, rank_set in self.collective_group_dict.items(): + if not self.whether_valid_comm_group(rank_set): + continue + unioned_set = set() + remove_key = [] + for first_rank, stage in stage_group.items(): + if UnionFind.is_connected(rank_set, stage): + unioned_set = UnionFind.union(rank_set, stage, unioned_set) + remove_key.append(first_rank) + if unioned_set: + for key in remove_key: + del stage_group[key] + stage_group[min(unioned_set)] = unioned_set + else: + stage_group[min(rank_set)] = rank_set + first_rank_sort_list = sorted([first_rank for first_rank in stage_group]) + self.communication_group[Constant.P2P] = \ + [list(stage_group.get(first_rank, {})) for first_rank in first_rank_sort_list] + + def whether_valid_comm_group(self, rank_set: set): + """ + while distinguish which communication group should be used to infer stage info, these group should be ignored: + 1. 
group can not include more than 1 rank in every single p2p group + """ + for p2p_rank_set in self.p2p_comm_group: + if len(rank_set.intersection(p2p_rank_set)) > 1: + return False + return True + + @abstractmethod + def read_communication_func(self, params: tuple): + pass + + def analyze_communication_data(self): + for rank_id, rank_id_comm_dict, rank_id_matrix_dict in self.rank_comm_dir_dict: + for step_id, step_id_dict in rank_id_comm_dict.items(): + if not isinstance(step_id_dict, dict): + print(f"[WARNING] rank{rank_id}'s communication.json has a wrong data struct.") + continue + self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) + for comm_op_type, comm_op_dict in step_id_dict.items(): + self.add_communication_ops(rank_id, step_id, comm_op_type, comm_op_dict) + + for step_id, step_id_dict in rank_id_matrix_dict.items(): + if not isinstance(step_id_dict, dict): + print(f"[WARNING] rank{rank_id}'s communication_matrix.json has a wrong data struct.") + continue + self.set_p2p_link(rank_id, step_id, rank_id_matrix_dict) + self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) + + @abstractmethod + def dump_data(self): + pass + + def collect_comm_data(self): + comm_data_dict = { + Constant.COLLECTIVE_GROUP: self.collective_group_dict, + Constant.COMMUNICATION_OPS: self.communication_ops, + Constant.MATRIX_OPS: self.matrix_ops, + Constant.COMMUNICATION_GROUP: self.communication_group + } + return comm_data_dict + + def generate(self): + self.load_communication_data() + self.analyze_communication_data() + self.set_p2p_groups() + self.generate_collective_communication_group() + self.generate_p2p_communication_group() + self.dump_data() + return self.collect_comm_data() + + def set_p2p_link(self, rank_id: int, step_id: str, rank_id_matrix_dict: dict): + ops = rank_id_matrix_dict.get(step_id, {}) + self.add_matrix_ops(rank_id, step_id, ops) + if not ops: + print(f"[WARNING] rank{rank_id} {step_id} do not have communication 
matrix ops data.") + return + p2p_ops = ops.get(Constant.P2P, {}) + for op_name, link_dict in p2p_ops.items(): + self.append_p2p_link(op_name, link_dict) + + def append_p2p_link(self, op_name, link_dict): + for link in link_dict: + if '-' not in link: + print(f"[WARNING] {op_name} has an invalid link key {link}!") + break + src_rank = int(link.split('-')[0]) + dst_rank = int(link.split('-')[1]) + if src_rank != dst_rank: + rank_set = {src_rank, dst_rank} + if rank_set in self.p2p_link: + continue + self.p2p_link.append(rank_set) + + def get_collective_ops_name(self, rank_id: int, comm_op_dict: dict): + for comm_op in comm_op_dict: + if comm_op.startswith('Total'): + continue + group_name = comm_op.split('@')[-1] + self.collective_group_dict[group_name].add(rank_id) + + def add_communication_ops(self, rank_id: str, step_id: str, comm_op_type: str, comm_op_dict: dict): + for comm_op in comm_op_dict: + if comm_op.startswith('Total'): + continue + group_name = comm_op.split('@')[-1] + self.communication_ops.append({ + Constant.RANK_ID: rank_id, + Constant.STEP_ID: step_id, + Constant.COMM_OP_TYPE: comm_op_type, + Constant.COMM_OP_NAME: comm_op, + Constant.GROUP_NAME: group_name, + Constant.COMM_OP_INFO: comm_op_dict.get(comm_op) + }) + + def add_matrix_ops(self, rank_id: int, step_id: str, step_id_dict: dict): + for comm_op_type, comm_dict in step_id_dict.items(): + if comm_op_type != Constant.COLLECTIVE and comm_op_type != Constant.P2P: + print(f"[WARNING] Unknown communication operators type!") + continue + for op_name, op_link_info in comm_dict.items(): + if op_name.startswith('Total'): + continue + group_name = op_name.split('@')[-1] + self.matrix_ops.append({ + Constant.RANK_ID: rank_id, + Constant.STEP_ID: step_id, + Constant.COMM_OP_TYPE: comm_op_type, + Constant.COMM_OP_NAME: op_name, + Constant.GROUP_NAME: group_name, + Constant.COMM_OP_INFO: op_link_info + }) + + +class UnionFind(object): + """Disjoint Set Union""" + + @classmethod + def union(cls, first_set: 
set, second_set: set, third_set: set): + """make p and q the same set""" + return first_set | second_set | third_set + + @classmethod + def is_connected(cls, first_set: set, second_set: set): + """ + check whether set p and set q are connected + """ + if first_set & second_set: + return True + else: + return False diff --git a/profiler/cluster_analyse_review/communication_group/communication_db_group.py b/profiler/cluster_analyse_review/communication_group/communication_db_group.py new file mode 100644 index 0000000000000000000000000000000000000000..510dcd971357dfb4798e4d284a72fbb3f3a21859 --- /dev/null +++ b/profiler/cluster_analyse_review/communication_group/communication_db_group.py @@ -0,0 +1,57 @@ +import os + +from common_func.db_manager import DBManager +from common_func.constant import Constant +from communication_group.base_communication_group import BaseCommunicationGroup + + +class CommunicationDBGroup(BaseCommunicationGroup): + COMMUNICATION_GROUP_TABLE = "CommunicationGroup" + + def __init__(self, params: dict): + super().__init__(params) + + def read_communication_func(self, params: tuple): + if len(params) < 3: + return -1, ({}, {}, {}) + rank_id = params[0] + db_path = params[1] + time_data = [] + bandwidth_data = [] + matrix_data = [] + if os.path.exists(db_path): + conn, cursor = DBManager.create_connect_db(db_path) + time_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_TIME) + bandwidth_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_BANDWIDTH) + matrix_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_MATRIX) + if (DBManager.check_tables_in_db(db_path, Constant.TABLE_COMM_ANALYZER_TIME, + Constant.TABLE_COMM_ANALYZER_BANDWIDTH) + and self.analysis_mode in ["all", "communication_time"]): + time_data = DBManager.fetch_all_data(cursor, time_info_sql) + bandwidth_data = DBManager.fetch_all_data(cursor, bandwidth_info_sql) + if (DBManager.check_tables_in_db(db_path, 
Constant.TABLE_COMM_ANALYZER_MATRIX) + and self.analysis_mode in ["all", "communication_matrix"]): + matrix_data = DBManager.fetch_all_data(cursor, matrix_info_sql) + DBManager.destroy_db_connect(conn, cursor) + comm_data = self.adapter.transfer_comm_from_db_to_json(time_data, bandwidth_data) + comm_matrix_data = self.adapter.transfer_matrix_from_db_to_json(matrix_data) + return rank_id, comm_data, comm_matrix_data + + def dump_data(self): + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + res = [] + for data_type, data_list in self.communication_group.items(): + for data in data_list: + rank_set = "(" + ",".join(str(i) for i in data) + ")" + data = [data_type, rank_set] + res.append(data) + if res: + DBManager.create_tables(result_db, self.COMMUNICATION_GROUP_TABLE) + conn, cursor = DBManager.create_connect_db(result_db) + sql = "insert into {} values ({value})".format(self.COMMUNICATION_GROUP_TABLE, + value="?," * (len(res[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, res) + DBManager.destroy_db_connect(conn, cursor) + else: + print("[WARNING] The CommunicationGroup table won't be created because no data has been calculated.") diff --git a/profiler/cluster_analyse_review/communication_group/communication_group_generator.py b/profiler/cluster_analyse_review/communication_group/communication_group_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..3dca90454b608fe3ffb1c365854c2aa3950b6cee --- /dev/null +++ b/profiler/cluster_analyse_review/communication_group/communication_group_generator.py @@ -0,0 +1,32 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from common_func.constant import Constant +from communication_group.communication_db_group import CommunicationDBGroup +from communication_group.communication_json_group import CommunicationJsonGroup + + +class CommunicationGroupGenerator: + + GROUP_MAP = { + Constant.DB: CommunicationDBGroup, + Constant.TEXT: CommunicationJsonGroup + } + + def __init__(self, params: dict): + self.processor = self.GROUP_MAP.get(params.get(Constant.DATA_TYPE))(params) + + def generate(self): + return self.processor.generate() diff --git a/profiler/cluster_analyse_review/communication_group/communication_json_group.py b/profiler/cluster_analyse_review/communication_group/communication_json_group.py new file mode 100644 index 0000000000000000000000000000000000000000..f6e01e3abfde4d8f180043a5bf9a50c6b5a4964c --- /dev/null +++ b/profiler/cluster_analyse_review/communication_group/communication_json_group.py @@ -0,0 +1,44 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +from common_func.constant import Constant +from common_func.file_manager import FileManager +from communication_group.base_communication_group import BaseCommunicationGroup + + +class CommunicationJsonGroup(BaseCommunicationGroup): + COMMUNICATION_GROUP_JSON = "communication_group.json" + + def __init__(self, params: dict): + super().__init__(params) + + def dump_data(self): + FileManager.create_json_file(self.collection_path, self.communication_group, self.COMMUNICATION_GROUP_JSON) + + def read_communication_func(self: any, params: tuple): + if len(params) < 3: + return -1, {}, {} + rank_id = params[0] + comm_json_path = params[1] + matrix_json_path = params[2] + comm_data = {} + matrix_data = {} + if os.path.exists(comm_json_path) and self.analysis_mode in ["all", "communication_time"]: + comm_data = FileManager.read_json_file(comm_json_path) + if os.path.exists(matrix_json_path) and self.analysis_mode in ["all", "communication_matrix"]: + matrix_data = FileManager.read_json_file(matrix_json_path) + return rank_id, comm_data, matrix_data diff --git a/profiler/cluster_analyse_review/prof_bean/__init__.py b/profiler/cluster_analyse_review/prof_bean/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c --- /dev/null +++ b/profiler/cluster_analyse_review/prof_bean/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/prof_bean/step_trace_time_bean.py b/profiler/cluster_analyse_review/prof_bean/step_trace_time_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..b0a3be4f5eaccea70aa912bc85e68d70dbda3bde --- /dev/null +++ b/profiler/cluster_analyse_review/prof_bean/step_trace_time_bean.py @@ -0,0 +1,39 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +class StepTraceTimeBean: + STEP = "Step" + COMPLEMENT_HEADER = ["Step", "Type", "Index"] + + def __init__(self, data: list): + self._data = data + + @property + def row(self) -> list: + row = [] + for field_name in self._data.keys(): + if field_name == self.STEP: + continue + row.append(float(self._data.get(field_name, ))) + return row + + @property + def step(self) -> str: + return self._data.get(self.STEP, '') + + @property + def all_headers(self) -> list: + return self.COMPLEMENT_HEADER + list(self._data.keys())[1:] diff --git a/profiler/cluster_analyse_review/resources/.keep b/profiler/cluster_analyse_review/resources/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/compare_tools_review/README.md b/profiler/compare_tools_review/README.md new file mode 100644 index 0000000000000000000000000000000000000000..be56b4d993ce20e0778611c53af41421d5577287 --- /dev/null +++ b/profiler/compare_tools_review/README.md @@ -0,0 +1,301 @@ +# 性能比对工具 + +compare_tools(性能比对工具)支持比较GPU与NPU之间、NPU与NPU之间的性能差异,通过对训练耗时和内存占用的比对分析,定位到具体劣化的算子,帮助用户提升性能调优的效率。工具将训练耗时拆分为计算、通信、调度三大维度,并针对计算和通信分别进行算子级别的比对;将训练占用的总内存,拆分成算子级别的内存占用进行比对。 + +## 使用场景 + +场景一:PyTorch训练工程从GPU迁移至NPU后出现性能劣化,通过工具分析出劣化点。 + +场景二:PyTorch或MindSpore训练工程在NPU上,不同版本之间存在性能差距,通过工具定位具体差异。 + +场景三:PyTorch训练工程从GPU迁移至MindSpore NPU后出现性能劣化,通过工具分析出劣化点。 + +## 使用指导 + +### 环境依赖 + +使用本工具前需要安装的依赖包: + +```bash +pip3 install prettytable +pip3 install xlsxwriter +pip3 install pandas +pip3 install numpy +``` + +### PyTorch框架性能数据采集 + +使用本工具之前需要采集GPU或者NPU的性能数据,建议只采集一个step的性能数据,然后进行性能比对分析。 + +#### GPU性能数据采集 + +通过PyTorch Profiler工具采集GPU的性能数据,参考链接:[torch.profiler](https://pytorch.org/docs/stable/profiler.html)。 + +采集样例代码参考一: + +```Python +with torch.profiler.profile( + profile_memory=True, # 内存数据采集的开关 + record_shapes=True, # 算子input shape信息采集的开关 + schedule=torch.profiler.schedule(wait=10, warmup=0, active=1, repeat=1), + 
on_trace_ready=torch.profiler.tensorboard_trace_handler("./result_dir") +) as prof: + for step in range(step_number): + train_one_step() + prof.step() +``` + +采集样例代码参考二: + +```Python +prof = torch.profiler.profile( + profile_memory=True, # 内存数据采集的开关 + record_shapes=True, # 算子input shape信息采集的开关 + on_trace_ready=torch.profiler.tensorboard_trace_handler("./result_dir")) +for step in range(step_number): + if step == 11: + prof.start() + train_one_step() + if step == 11: + prof.stop() +``` + +PyTorch Profiler采集结果数据目录结构如下: + +```Python +|- pytorch_profiling + |- *.pt.trace.json +``` + +#### NPU性能数据采集 + +通过Ascend PyTorch Profiler工具采集NPU的性能数据,采集参数配置与GPU基本一致,只需将GPU的性能数据采集代码中torch.profiler替换成torch_npu.profiler。参考链接:[Profiling数据采集](https://gitee.com/ascend/att/tree/master/profiler)。 + +Ascend PyTorch Profiler采集结果数据目录结构如下: + +```bash +|- ascend_pytorch_profiling + |- * _ascend_pt + |- ASCEND_PROFILER_OUTPUT + |- trace_view.json + |- FRAMEWORK + |- PROF_XXX + |- * _ascend_pt +``` + +### MindSpore框架性能数据采集 + +#### NPU性能数据采集 + +当前MindSpore场景仅支持NPU环境性能数据与PyTorch GPU性能数据进行比对;以及MindSpore训练工程在NPU上,不同版本之间的性能数据进行比对。 + +通过MindSpore性能调试工具采集NPU的性能数据,建议只采集或只解析一个step的性能数据,参考链接:[性能调试(Ascend)](https://www.mindspore.cn/mindinsight/docs/zh-CN/r2.3/performance_profiling_ascend.html)。 + +MindSpore性能调试工具采集结果数据目录结构如下: + +``` + +|- profiler/{rank-*}_{timestamps}_ascend_ms + |- ASCEND_PROFILER_OUTPUT + |- kernel_details.csv + |- trace_view.json +``` + +进行性能比对时,MindSpore采集的性能数据须指定到`profiler/{rank-*}_{timestamps}_ascend_ms`或`ASCEND_PROFILER_OUTPUT`层级。 + +### 性能数据比对 + +性能比对工具将总体性能拆解为训练耗时和内存占用,其中训练耗时可拆分为算子(包括算子和nn.Module)、通信、调度三个维度,以打屏的形式输出总体指标,帮助用户定界劣化的方向。与此同时,工具还会生成performance_comparison_result_*.xlsx,展示每个算子在执行耗时、通信耗时、内存占用的优劣,可通过DIFF列大于0筛选出劣化算子。详细介绍请参见“**比对结果说明**”。 + +性能比对工具支持使用**命令行**和**脚本**两种方式执行性能数据比对操作,这两种方式均支持**通用参数**和**算子性能比对特有参数**。 + +#### 命令行方式 + +1. 参见《[性能工具](../README.md)》完成工具安装。 + +2. 
执行如下命令进行性能数据比对: + + ``` + msprof-analyze compare -d [比对性能数据文件所在路径] -bp [基准性能数据文件所在路径] --output_path=[比对结果文件存放路径] + ``` + + - -d(必选):比对性能数据文件所在路径。可以指定以“ascend_pt”结尾的目录、ASCEND_PROFILER_OUTPUT目录或trace_view.json文件,指定trace_view.json无法显示算子的内存占用。 + - -bp(必选):基准性能数据文件所在路径。基准性能数据文件若以GPU为基准,指定到以".pt.trace"结尾的json文件;若以NPU不同版本为基准,指定文件与-d一致。 + - --output_path(可选):性能比对结果存放的路径,默认保存在当前目录。 + +#### 脚本方式 + +将att代码仓下载到本地,执行如下命令: + +```bash +# 进入att代码仓目录下的compare_tools目录 +cd att/profiler/compare_tools +# 执行最简比对命令 +python performance_compare.py [基准性能数据文件所在路径] [比对性能数据文件所在路径] --output_path=[比对结果文件存放路径] +``` + +- 基准性能数据文件所在路径(必选):若以GPU为基准,指定到以".pt.trace"结尾的json文件;若以NPU不同版本为基准,指定文件参考**比对性能数据文件所在路径**。 +- 比对性能数据文件所在路径(必选):可以指定以“ascend_pt”结尾的目录、ASCEND_PROFILER_OUTPUT目录或trace_view.json文件,指定trace_view.json无法显示算子的内存占用。 +- --output_path(可选):性能比对结果存放的路径,默认保存在当前目录。 + +#### 通用参数说明 + +| 参数名 | 说明 | 是否必选 | +| ------------------------------ |---------------------------------------------------| -------- | +| --enable_profiling_compare | 开启总体性能比对。 | 否 | +| --enable_operator_compare | 开启算子性能比对。MindSpore场景暂不支持。该开关较耗时,建议只采集一个step的性能数据。 | 否 | +| --enable_communication_compare | 开启通信性能比对。 | 否 | +| --enable_memory_compare | 开启算子内存比对。MindSpore场景暂不支持。该开关较耗时,建议只采集一个step的性能数据。 | 否 | + +说明:以上4个开关均不设置的情况下,**工具默认开启所有的性能比对**,当用户设置了以上开关,则按照用户设置的开关进行性能比对,示例如下: + +```bash +msprof-analyze compare -d [比对性能数据文件所在路径] -bp [基准性能数据文件所在路径] --output_path=./result_dir --enable_profiling_compare +``` + +或 + +```bash +python performance_compare.py [基准性能数据文件] [比对性能数据文件] --output_path=./result_dir --enable_profiling_compare +``` + +此时表示仅开启总体性能比对。 + +#### 算子性能比对特有参数说明 + +| 参数名 | 说明 | 是否必选 | +| ----------------- | ------------------------------------------------------------ | -------- | +| --gpu_flow_cat | 配置GPU trace中CPU侧算子与device kernel的连线标识,当GPU的Device Duration(us)均为0时设置。使用chrome://tracing打开GPU的json,右上角Flow events找到连线标识,将标识配置进该参数。使用示例:--gpu_flow_cat=async_gpu | 否 | +| --use_input_shape | 开启算子精准匹配,默认关闭。使用示例:--use_input_shape | 否 | +| 
--max_kernel_num | 设置CPU侧算子下发的最大kernel数量,当超过设定值时工具会自动往下找子算子,直至满足条件。默认仅比对最上层算子,粒度较粗;若想要更细粒度的算子比对,可设置该参数,参数值不得小于4,参数值设置越小,比对粒度越细。使用示例:--max_kernel_num=10 | 否 | +| --op_name_map | 设置GPU与NPU等价的算子名称的映射关系,以字典形式存入。使用示例:--op_name_map={'Optimizer.step#SGD.step':'Optimizer.step#NpuFusedSGD.step'} | 否 | + +## 比对结果说明 + +MindSpore场景仅支持**总体性能**和**通信性能**的对比。 + +### 总体性能 + +总体性能比对结果以打屏的形式呈现。 + +| 字段 | 说明 | +| --------------------------------------- | ------------------------------------------------------------ | +| Cube Time(Num) | Cube算子总耗时,Num表示计算的次数。 | +| Vector Time(Num) | Vector算子总耗时,Num表示计算的次数。 | +| Conv Time(Forward)(Num) | conv前向算子耗时,Num表示计算的次数。 | +| Conv Time(Backward)(Num) | conv反向算子耗时,Num表示计算的次数。 | +| Flash Attention Time(Forward)(Num) | Flash Attention算子前向耗时,Num表示计算的次数。 | +| Flash Attention Time(Backward)(Num) | Flash Attention算子反向耗时,Num表示计算的次数。 | +| Paged Attention Time(Num) | Paged Attention算子耗时,Num表示计算的次数。 | +| Lccl Time(Num) | Lccl算子耗时,Num表示计算的次数。 | +| Computing Time | 计算流耗时,计算流所有event耗时总和。如果有多条并发计算,计算流耗时对重叠部分只会计算一次。 | +| Mem Usage | 内存使用。GPU上的内存使用可以使用nvidia-smi查看,NPU上的内存使用可以使用npu-smi查看,Profiling信息采集时打开profile_memory=True开关,mem usage显示的是memory_record里面的最大resevered值,一般来说是进程级内存。 | +| Uncovered Communication Time(Wait Time) | 通信未掩盖耗时,包含Wait Time(只有采集性能数据的Level等级为L1以上并且采集NPU数据时才会存在)为同步时间。 | +| SDMA Time(Num) | 拷贝类任务耗时,Num表示计算的次数。 | +| Free Time | 调度耗时 = E2E耗时 - 算子耗时 - 通信不可掩盖耗时。Free的定义为Device侧既不在通信又不在计算的时间,因此包含拷贝时间(SDMA Time)。 | +| E2E Time(Not minimal profiling) | E2E总耗时,计算流端到端耗时。当存在Not minimal profiling时,表示该时间存在性能膨胀,会影响通信和调度耗时。 | +| Other Time | AI CPU、DSA、TensorMove等其他算子耗时。 | + +可以采取最简性能数据采集的方式来减少E2E耗时的性能膨胀,示例代码如下: + +```python +with torch_npu.profiler.profile( + activities=[torch_npu.profiler.ProfilerActivity.NPU], + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=1, repeat=1, skip_first=10), + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./result"), +) as prof: + for step in range(steps): + train_one_step() + prof.step() +``` + 
+activities配置仅采集NPU数据,不配置experimental_config参数以及其他可选开关。 + +- 当Computing Time耗时增大,分析**算子性能**。 +- 当Uncovered Communication Time耗时增大,分析**通信性能**,若通信性能分析没有劣化的通信算子,代表通信与计算的并行度较差,继续进行NPU的集群性能分析。 +- 当Mem Usage增大,分析**算子内存**,若没有明显占用较大的算子,则代表算子内存申请量没有差异,问题在于内存的释放(持有时间过久),可以使用tensorboard或ascend insight继续进行NPU内存的分析。 + +### 算子性能 + +MindSpore场景暂不支持。 + +#### 比对数据无Python Function + +算子性能比对结果在performance_comparison_result_*.xlsx中OperatorCompare和OperatorCompareStatistic的sheet页呈现。 + +- OperatorCompareStatistic:算子为粒度的统计呈现,按照算子在device上的总耗时与基准算子的差距值(Diff Duration(ms)列)进行逆序。 +- OperatorCompare:算子比对的明细展示,可以查看每一个算子对应的kernel详情。 +- Diff Ratio:比较算子在device上执行总耗时 / 基准算子在device上执行总耗时,红色代表劣化。 +- Device Duration(us):该算子下发到device上执行的所有kernel耗时的总和。 + +步骤1:查看OperatorCompareStatistic页,找出耗时差距TOP的算子。 +步骤2:查看OperatorCompare页,搜索耗时差距TOP的算子,查看具体执行的kernel耗时,寻找可优化点。 + +#### 比对数据有Python Function + +算子性能比对结果在performance_comparison_result_*.xlsx中ModuleCompareStatistic、ModuleCompare的sheet页呈现。 + +当用户采集时开启with_stack开关,会上报python function事件,当比对的双方数据都存在python function的事件时,可进行模块级别的比对。 + +- Module Class:Module名,如nn.Module: Linear。 +- Module Level:Module的层级。 +- Module Name:Module唯一标识名,如/ DynamicNet_0/ Linear_0。 +- Operator Name:框架侧算子名,如aten::add。字段为[ TOTAL ]代表该module的总体情况。 +- Kernel Detail:算子详细信息。 +- Device Self Time(ms):该模块调用的算子(排除子模块)在device侧执行的总耗时,单位ms。 +- Number:该Module或算子被调用的次数。 +- Device Total Time(ms):该模块调用的算子(包含子模块)在device侧执行的总耗时,单位ms。 +- Device Total Time Diff(ms):GPU与NPU的Device Total Time(ms)差值。 +- Device Self Time Diff(ms):GPU与NPU的Device Self Time(ms)差值。 +- Total Time Ratio:GPU与NPU的Device Total Time(ms)比值。 +- Base Call Stack:基准文件模块的调用栈。 +- Comparison Call Stack:比较文件模块的调用栈。 + +ModuleCompare:模块及模块下算子比对的明细展示,可以查看每一个算子对应的kernel详情。 + +- Module Class:Module名,如nn.Module: Linear。 +- Module Level:Module的层级。 +- Module Name:Module唯一标识名,如/ DynamicNet_0/ Linear_0。 +- Operator Name:框架侧算子名,如aten::add。字段为[ TOTAL ]代表该module的总体情况。 +- Kernel Detail:算子详细信息。 +- Device Self Time(us):该模块调用的算子(排除子模块)在device侧执行的总耗时,单位us。 +- Device 
Total Time(us):该模块调用的算子(包含子模块)在device侧执行的总耗时,单位us。 +- Device Total Time Diff(us):GPU与NPU的Device Total Time(us)差值。 +- Device Self Time Diff(us):GPU与NPU的Device Self Time(us)差值。 +- Total Time Ratio:GPU与NPU的Device Total Time(us)比值。 +- Base Call Stack:有劣化的模块或算子,基准文件模块的调用栈。 +- Comparison Call Stack:有劣化的模块或算子,比较文件模块的调用栈。 + +步骤1:查看ModuleCompareStatistic页,找出耗时差距TOP的模块。 + + 筛选Operator Name字段为[ TOTAL ],将模块总体情况按照Device Self Time(ms)字段逆序,可识别出耗时差距TOP的模块。 + + 恢复数据,可按照Order Id字段升序。 + +步骤2:查看ModuleCompare页,查找耗时差距TOP模块下的劣化算子。 + +步骤3:通过调用栈找到对应的代码行。 + +### 通信性能 + +通信性能比对结果在performance_comparison_result_*.xlsx中CommunicationCompare的sheet页呈现。 + +- 第二行表头:通信算子的summary信息,包括通信算子名称、调用总次数、通信算子总耗时(单位:us)、通信算子平均耗时(单位:us)、通信算子最大耗时(单位:us)、通信算子最小耗时(单位:us)。 +- 无背景色的记录行:通信算子的detail信息,仅支持NPU,包含了该通信算子下的所有Task信息,包括Task名称、Task调用次数、Task总耗时(单位:us)、Task平均耗时(单位:us)、Task最大耗时(单位:us)、Task最小耗时(单位:us)。 +- Diff Ratio: 比较通信算子的总耗时 / 基准通信算子的总耗时,红色代表劣化。 + +### 算子内存 + +MindSpore场景暂不支持。 + +算子内存比对结果在performance_comparison_result_*.xlsx中MemoryCompare和MemoryCompareStatistic的sheet页呈现。 + +- MemoryCompareStatistic:算子为粒度的统计呈现,按照算子占用的总内存与基准算子的差距值(Diff Memory(MB))进行逆序。 + +- MemoryCompare:算子内存比对的明细展示,可以查看每一个算子申请内存的详情。 + +- Diff Ratio: 比较算子占用的总内存 / 基准算子占用的总内存,红色代表劣化。 + +- Size(KB):该算子占用的device内存大小,单位KB。 + +步骤1:查看MemoryCompareStatistic页,找出内存占用差距TOP的算子。 +步骤2:查看MemoryCompare页,搜索内存占用差距TOP的算子,查看具体占用的子算子。 diff --git a/profiler/compare_tools_review/__init__.py b/profiler/compare_tools_review/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c --- /dev/null +++ b/profiler/compare_tools_review/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/compare_tools_review/compare_backend/__init__.py b/profiler/compare_tools_review/compare_backend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/compare_tools_review/compare_backend/comparator/__init__.py b/profiler/compare_tools_review/compare_backend/comparator/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/compare_tools_review/compare_backend/comparator/base_comparator.py b/profiler/compare_tools_review/compare_backend/comparator/base_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..330fb871ee19b9bac1c0dfff4cae5648ebeedf1c --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/comparator/base_comparator.py @@ -0,0 +1,24 @@ +from abc import ABC, abstractmethod + + +class BaseComparator(ABC): + def __init__(self, origin_data: any, bean: any): + self._sheet_name = bean.TABLE_NAME + self._headers = bean.HEADERS + self._overhead = bean.OVERHEAD + self._origin_data = origin_data + self._bean = bean + self._rows = [] + + def generate_data(self) -> dict: + ''' + generate one sheet(table) data + type: dict + sheet name as the dict key + ''' + self._compare() + return {self._sheet_name: {"headers": self._headers, "rows": self._rows, "overhead": self._overhead}} + + @abstractmethod + def _compare(self): + raise NotImplementedError("Function _compare need to be implemented.") diff --git 
a/profiler/compare_tools_review/compare_backend/comparator/communication_comparator.py b/profiler/compare_tools_review/compare_backend/comparator/communication_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..f7580bec89a85b8d23e0ec878eda944d95e69f3f --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/comparator/communication_comparator.py @@ -0,0 +1,20 @@ +from compare_backend.comparator.base_comparator import BaseComparator +from compare_backend.compare_bean.communication_bean import CommunicationBean +from compare_backend.utils.constant import Constant +from compare_backend.utils.common_func import update_order_id + + +class CommunicationComparator(BaseComparator): + def __init__(self, origin_data: dict, bean: any): + super().__init__(origin_data, bean) + + def _compare(self): + base_data = self._origin_data.get(Constant.BASE_DATA, {}) + comparison_data = self._origin_data.get(Constant.COMPARISON_DATA, {}) + for comm_name, comm_data in base_data.items(): + comparison_comm_data = comparison_data.pop(comm_name, {}) + self._rows.extend(CommunicationBean(comm_name, comm_data, comparison_comm_data).rows) + for comm_name, comm_data in comparison_data.items(): + self._rows.extend(CommunicationBean(comm_name, {}, comm_data).rows) + update_order_id(self._rows) + diff --git a/profiler/compare_tools_review/compare_backend/comparator/module_comparetor.py b/profiler/compare_tools_review/compare_backend/comparator/module_comparetor.py new file mode 100644 index 0000000000000000000000000000000000000000..49c50b53c5a1b00bd17b7281d80b61d5011cb59a --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/comparator/module_comparetor.py @@ -0,0 +1,36 @@ +from compare_backend.comparator.base_comparator import BaseComparator +from compare_backend.utils.common_func import update_order_id +from compare_backend.utils.constant import Constant + + +class ModuleComparator(BaseComparator): + def __init__(self, origin_data: any, bean: 
any): + super().__init__(origin_data, bean) + + def _compare(self): + if not self._origin_data: + return + base_all_data = [data for data in self._origin_data if data[0]] # index 0 for base module + base_all_data.sort(key=lambda x: x[0].start_time) + base_none_data = [data for data in self._origin_data if not data[0]] # index 0 for base module + base_none_data.sort(key=lambda x: x[1].start_time) + index = 0 + for base_module, comparison_module in base_all_data: + if not comparison_module: + self._rows.extend(self._bean(base_module, comparison_module).rows) + continue + while index < len(base_none_data): + module = base_none_data[index][1] # index 1 for comparison module + if module.start_time < comparison_module.start_time: + self._rows.extend(self._bean(None, module).rows) + index += 1 + else: + break + self._rows.extend(self._bean(base_module, comparison_module).rows) + while index < len(base_none_data): + module = base_none_data[index][1] # index 1 for comparison module + self._rows.extend(self._bean(None, module).rows) + index += 1 + update_order_id(self._rows) + if not any(row[-1] != Constant.NA for row in self._rows): + print(f"[WARNING] If you want to see the operator's call stack, you must enable with_stack switch.") diff --git a/profiler/compare_tools_review/compare_backend/comparator/module_statistic_comparator.py b/profiler/compare_tools_review/compare_backend/comparator/module_statistic_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..e09108f3cbe3744068daf6c5316dc318aea53177 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/comparator/module_statistic_comparator.py @@ -0,0 +1,45 @@ +from collections import OrderedDict + +from compare_backend.comparator.base_comparator import BaseComparator +from compare_backend.utils.common_func import update_order_id + + +class ModuleStatisticComparator(BaseComparator): + def __init__(self, origin_data: list, bean: any): + super().__init__(origin_data, bean) + + def 
_compare(self): + if not self._origin_data: + return + base_module_dict, comparison_module_dict = self._group_by_module_name() + for module_name, base_data in base_module_dict.items(): + comparison_data = comparison_module_dict.pop(module_name, []) + self._rows.extend(self._bean(module_name, base_data, comparison_data).rows) + for module_name, comparison_data in comparison_module_dict.items(): + self._rows.extend(self._bean(module_name, [], comparison_data).rows) + update_order_id(self._rows) + + def _group_by_module_name(self): + base_module_dict, comparison_module_dict = OrderedDict(), OrderedDict() + base_all_data = [data for data in self._origin_data if data[0]] # index 0 for base module + base_all_data.sort(key=lambda x: x[0].start_time) + base_none_data = [data for data in self._origin_data if not data[0]] # index 0 for base module + base_none_data.sort(key=lambda x: x[1].start_time) + index = 0 + for base_module, comparison_module in base_all_data: + base_module_dict.setdefault(base_module.module_name, []).append(base_module) + if not comparison_module: + continue + while index < len(base_none_data): + module = base_none_data[index][1] # index 1 for comparison module + if module.start_time < comparison_module.start_time: + comparison_module_dict.setdefault(module.module_name, []).append(module) + index += 1 + else: + break + comparison_module_dict.setdefault(comparison_module.module_name, []).append(comparison_module) + while index < len(base_none_data): + module = base_none_data[index][1] # index 1 for comparison module + comparison_module_dict.setdefault(module.module_name, []).append(module) + index += 1 + return base_module_dict, comparison_module_dict diff --git a/profiler/compare_tools_review/compare_backend/comparator/operator_comparator.py b/profiler/compare_tools_review/compare_backend/comparator/operator_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..cc475116cab59104a049689292f25f339a7285ce --- /dev/null +++ 
class OperatorComparator(BaseComparator):
    """Compare matched operator pairs, one detail row per pair."""

    def __init__(self, origin_data: any, bean: any):
        super().__init__(origin_data, bean)

    def _compare(self):
        """Build one row per (base_op, comparison_op) pair via the bean.

        Row order follows the input pair order; the bean receives the
        0-based index so it can render a 1-based order id.
        """
        if not self._origin_data:
            return
        # Idiom fix: was `[None] * n` pre-allocation filled by index; a
        # comprehension produces the same rows in the same order.
        self._rows = [self._bean(index, base_op, comparison_op).row
                      for index, (base_op, comparison_op) in enumerate(self._origin_data)]
class OverallPerformanceComparator(BaseComparator):
    """Build the overall-performance sheet: one column per profiling run.

    Optional metrics (conv/FA/PA/lccl/sdma/memory) only appear when either
    side has a non-zero value, so headers and columns grow in lockstep.
    """

    def __init__(self, origin_data: dict, bean: any):
        super().__init__(origin_data, bean)

    def _compare(self):
        """Fill ``self._headers`` and two row-columns (base vs comparison)."""
        base_profiling_info = self._origin_data.get(Constant.BASE_DATA)
        comp_profiling_info = self._origin_data.get(Constant.COMPARISON_DATA)
        # First header cell is blank; first data cell names the profiler type.
        self._headers = ['']
        base_col = [f'{base_profiling_info.profiling_type}']
        comp_col = [f'{comp_profiling_info.profiling_type}']
        # Cube/Vector breakdown is hidden when either side hides op details.
        if not base_profiling_info.hide_op_details and not comp_profiling_info.hide_op_details:
            self._headers.extend(['Cube Time(Num)', 'Vector Time(Num)'])
            base_col.extend([f'{base_profiling_info.cube_time:.3f}s({base_profiling_info.cube_num})',
                             f'{base_profiling_info.vec_time:.3f}s({base_profiling_info.vec_num})'])
            comp_col.extend([f'{comp_profiling_info.cube_time:.3f}s({comp_profiling_info.cube_num})',
                             f'{comp_profiling_info.vec_time:.3f}s({comp_profiling_info.vec_num})'])
        if base_profiling_info.conv_time_fwd or comp_profiling_info.conv_time_fwd:
            self._headers.append('Conv Time(Forward)(Num)')
            base_col.append(f'{base_profiling_info.conv_time_fwd:.3f}s({base_profiling_info.conv_num_fwd})')
            comp_col.append(f'{comp_profiling_info.conv_time_fwd:.3f}s({comp_profiling_info.conv_num_fwd})')
        if base_profiling_info.conv_time_bwd or comp_profiling_info.conv_time_bwd:
            self._headers.append('Conv Time(Backward)(Num)')
            base_col.append(f'{base_profiling_info.conv_time_bwd:.3f}s({base_profiling_info.conv_num_bwd})')
            comp_col.append(f'{comp_profiling_info.conv_time_bwd:.3f}s({comp_profiling_info.conv_num_bwd})')
        if base_profiling_info.fa_time_fwd or comp_profiling_info.fa_time_fwd:
            self._headers.append('Flash Attention Time(Forward)(Num)')
            base_col.append(f'{base_profiling_info.fa_time_fwd:.3f}s({base_profiling_info.fa_num_fwd})')
            comp_col.append(f'{comp_profiling_info.fa_time_fwd:.3f}s({comp_profiling_info.fa_num_fwd})')
        if base_profiling_info.fa_time_bwd or comp_profiling_info.fa_time_bwd:
            self._headers.append('Flash Attention Time(Backward)(Num)')
            base_col.append(f'{base_profiling_info.fa_time_bwd:.3f}s({base_profiling_info.fa_num_bwd})')
            comp_col.append(f'{comp_profiling_info.fa_time_bwd:.3f}s({comp_profiling_info.fa_num_bwd})')
        if base_profiling_info.pa_time or comp_profiling_info.pa_time:
            self._headers.append('Paged Attention Time(Num)')
            base_col.append(f'{base_profiling_info.pa_time:.3f}s({base_profiling_info.pa_num})')
            comp_col.append(f'{comp_profiling_info.pa_time:.3f}s({comp_profiling_info.pa_num})')
        if base_profiling_info.lccl_time or comp_profiling_info.lccl_time:
            self._headers.append('Lccl Time(Num)')
            base_col.append(f'{base_profiling_info.lccl_time:.3f}s({base_profiling_info.lccl_num})')
            comp_col.append(f'{comp_profiling_info.lccl_time:.3f}s({comp_profiling_info.lccl_num})')
        if base_profiling_info.other_time or comp_profiling_info.other_time:
            self._headers.append('Other Time')
            base_col.append(f'{base_profiling_info.other_time:.3f}s')
            comp_col.append(f'{comp_profiling_info.other_time:.3f}s')
        # Computing Time is always present.
        self._headers.extend(['Computing Time'])
        base_col.extend([f'{base_profiling_info.compute_time:.3f}s'])
        comp_col.extend([f'{comp_profiling_info.compute_time:.3f}s'])
        if base_profiling_info.memory_used or comp_profiling_info.memory_used:
            self._headers.append('Mem Usage')
            base_col.append(f'{base_profiling_info.memory_used:.2f}G')
            comp_col.append(f'{comp_profiling_info.memory_used:.2f}G')
        # Wait time is shown as "( / )" when unavailable (no wait data on the
        # base side, or a level0 comparison profile).
        self._headers.extend(['Uncovered Communication Time(Wait Time)'])
        if base_profiling_info.wait_time:
            base_col.extend(
                [f'{base_profiling_info.communication_not_overlapped: .3f}s({base_profiling_info.wait_time:.3f}s)'])
        else:
            base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s( / )'])
        if comp_profiling_info.is_level0:
            comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s( / )'])
        else:
            comp_col.extend(
                [f'{comp_profiling_info.communication_not_overlapped: .3f}s({comp_profiling_info.wait_time:.3f}s)'])
        if base_profiling_info.sdma_time or comp_profiling_info.sdma_time:
            self._headers.append('SDMA Time(Num)')
            base_col.append(f'{base_profiling_info.sdma_time:.3f}s({base_profiling_info.sdma_num})')
            comp_col.append(f'{comp_profiling_info.sdma_time:.3f}s({comp_profiling_info.sdma_num})')
        # Flag the E2E header when either profile is not minimal profiling.
        cue = '(Not minimal profiling)' if base_profiling_info.is_not_minimal_profiling() or \
            comp_profiling_info.is_not_minimal_profiling() else ''
        self._headers.extend(['Free Time', 'E2E Time' + cue])
        base_col.extend([f'{base_profiling_info.scheduling_time:.3f}s', f'{base_profiling_info.e2e_time:.3f}s'])
        comp_col.extend([f'{comp_profiling_info.scheduling_time:.3f}s', f'{comp_profiling_info.e2e_time:.3f}s'])
        self._rows = [base_col, comp_col]
class CommunicationInfo:
    """Summary statistics for one communication op (or one of its tasks)."""

    def __init__(self, name: str, data_list: list, is_task: bool):
        # Defaults describe an absent/empty record.
        self.comm_op_name = None
        self.task_name = None
        self.calls = None
        self.total_duration = 0
        self.avg_duration = None
        self.max_duration = None
        self.min_duration = None
        if not data_list:
            return
        if is_task:
            # Task rows show "|" in the op-name column and the real name in
            # the task column.
            self.comm_op_name = "|"
            self.task_name = name
        else:
            self.comm_op_name = name
            self.task_name = None
        call_count = len(data_list)
        self.calls = call_count
        self.total_duration = sum(data_list)
        self.avg_duration = self.total_duration / call_count
        self.max_duration = max(data_list)
        self.min_duration = min(data_list)
is_task=True) + for _task_name, _task_list in comparison_task.items(): + comparison_task_info = CommunicationInfo(_task_name, _task_list, is_task=True) + comparison_task.pop(_task_name, None) + break + rows.append(self._get_row(base_task_info, comparison_task_info, is_task=True)) + for task_name, task_list in comparison_task.items(): + base_task_info = CommunicationInfo("", [], is_task=True) + comparison_task_info = CommunicationInfo(task_name, task_list, is_task=True) + rows.append(self._get_row(base_task_info, comparison_task_info, is_task=True)) + + return rows + + @classmethod + def _get_row(cls, base_info: CommunicationInfo, comparison_info: CommunicationInfo, is_task: bool) -> list: + row = [None, base_info.comm_op_name, base_info.task_name, base_info.calls, base_info.total_duration, + base_info.avg_duration, base_info.max_duration, base_info.min_duration, comparison_info.comm_op_name, + comparison_info.task_name, comparison_info.calls, comparison_info.total_duration, + comparison_info.avg_duration, comparison_info.max_duration, comparison_info.min_duration] + diff_fields = [None, None] if is_task else calculate_diff_ratio(base_info.total_duration, + comparison_info.total_duration) + row.extend(diff_fields) + return row diff --git a/profiler/compare_tools_review/compare_backend/compare_bean/memory_compare_bean.py b/profiler/compare_tools_review/compare_backend/compare_bean/memory_compare_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..e1baa175311ae42765757feb8b13bbb3918c3727 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/compare_bean/memory_compare_bean.py @@ -0,0 +1,47 @@ +from compare_backend.utils.common_func import calculate_diff_ratio +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig +from compare_backend.utils.torch_op_node import TorchOpNode +from compare_backend.utils.tree_builder import TreeBuilder + + +class MemoryCompareBean: + TABLE_NAME 
class MemoryInfo:
    """Memory summary for a single torch op node (None-safe defaults)."""

    def __init__(self, torch_op: TorchOpNode):
        self.operator_name = None
        self.input_shape = None
        self.input_type = None
        self.size = 0
        self.memory_details = ""
        self._memory_list = []
        if not torch_op:
            return
        self.operator_name = torch_op.name
        self.input_shape = torch_op.input_shape
        self.input_type = torch_op.input_type
        self._memory_list = TreeBuilder.get_total_memory(torch_op)
        self._update_memory_fields()

    def _update_memory_fields(self):
        # Total allocated size plus the concatenated per-allocation details.
        self.size = sum(record.size for record in self._memory_list)
        self.memory_details = "".join(record.memory_details for record in self._memory_list)
class MemoryStatisticInfo:
    """Aggregate memory duration/size statistics over a list of op nodes."""

    def __init__(self, data_list: list):
        self._data_list = data_list
        self.duration_ms = 0
        self.size_mb = 0
        self.number = len(data_list)
        self._get_info()

    def _get_info(self):
        """Sum memory durations (ms) and sizes (MB) across all ops."""
        for op_data in self._data_list:
            memory_list = TreeBuilder.get_total_memory(op_data)
            # Idiom fix: generator expressions instead of building a
            # throwaway list for each sum (was sum([...])).
            self.duration_ms += sum(memory.duration / Constant.US_TO_MS for memory in memory_list)
            self.size_mb += sum(memory.size / Constant.KB_TO_MB for memory in memory_list)
compare_backend.utils.name_function import NameFunction +from compare_backend.utils.torch_op_node import TorchOpNode + + +class ModuleCompareBean: + TABLE_NAME = Constant.MODULE_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, base_module: ModuleNode, comparison_module: ModuleNode): + self._base_module = ModuleInfo(base_module) + self._comparison_module = ModuleInfo(comparison_module) + self.module_class = self._base_module.module_class if base_module else self._comparison_module.module_class + self.module_level = self._base_module.module_level if base_module else self._comparison_module.module_level + self.module_name = self._base_module.module_name if base_module else self._comparison_module.module_name + + @property + def rows(self): + return [self.get_total_row(), *self.get_detail_rows()] + + def get_total_row(self): + total_diff, total_ratio = calculate_diff_ratio(self._base_module.device_total_time, + self._comparison_module.device_total_time) + self_diff, _ = calculate_diff_ratio(self._base_module.device_self_time, + self._comparison_module.device_self_time) + return [None, self.module_class, self.module_level, self.module_name, "TOTAL", None, + self._base_module.device_self_time, self._base_module.device_total_time, "TOTAL", None, + self._comparison_module.device_self_time, self._comparison_module.device_total_time, total_diff, + self_diff, total_ratio, self._base_module.call_stack, self._comparison_module.call_stack] + + def get_detail_rows(self): + rows = [] + matched_ops = longest_common_subsequence_matching(self._base_module.top_layer_ops, + self._comparison_module.top_layer_ops, NameFunction.get_name) + for base_op, comparison_op in matched_ops: + base_op = OpInfo(base_op) + comparison_op = OpInfo(comparison_op) + self_diff, self_ratio = calculate_diff_ratio(base_op.device_self_time, comparison_op.device_self_time) + base_call_stack = base_op.call_stack if self_diff > 0 else 
class ModuleInfo:
    """Flattened display view of a ModuleNode (None-safe defaults)."""

    def __init__(self, module: ModuleNode):
        self.module_class = ""
        self.module_level = ""
        self.module_name = ""
        self.device_self_time = 0
        self.device_total_time = 0
        self.top_layer_ops = []
        self.call_stack = ""
        if not module:
            return
        self.module_class = module.module_class
        self.module_level = module.module_level
        # Strip the framework prefix for display.
        self.module_name = module.module_name.replace("nn.Module:", "")
        self.device_self_time = module.device_self_dur
        self.device_total_time = module.device_total_dur
        self.top_layer_ops = module.toy_layer_api_list
        self.call_stack = module.call_stack
compare_backend.utils.excel_config import ExcelConfig + + +class ModuleStatisticBean: + TABLE_NAME = Constant.MODULE_TOP_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, name: str, base_data: list, comparison_data: list): + self._module_name = name.replace("nn.Module:", "") + pattern = re.compile('_[0-9]+$') + self._module_class = pattern.sub('', name.split("/")[-1]) + self._module_level = name.count("/") + self._base_info = ModuleStatisticInfo(base_data) + self._comparison_info = ModuleStatisticInfo(comparison_data) + + @property + def rows(self): + rows = [self.get_total_row()] + rows.extend(self.get_detail_rows()) + return rows + + @staticmethod + def _get_kernel_detail_rows(base_kernel_dict, com_kernel_dict): + base_kernel_detals = "" + com_kernel_details = "" + for kernel_name, base_dur_list in base_kernel_dict.items(): + base_dur = "%.3f" % sum(base_dur_list) + base_kernel_detals += f"{kernel_name}, [number: {len(base_dur_list)}], [duration_ms: {base_dur}]\n" + for kernel_name, com_dur_list in com_kernel_dict.items(): + com_dur = "%.3f" % sum(com_dur_list) + com_kernel_details += f"{kernel_name}, [number: {len(com_dur_list)}], [duration_ms: {com_dur}]\n" + return [base_kernel_detals, com_kernel_details] + + def get_total_row(self): + total_diff, total_ratio = calculate_diff_ratio(self._base_info.device_total_dur_ms, + self._comparison_info.device_total_dur_ms) + self_diff, _ = calculate_diff_ratio(self._base_info.device_self_dur_ms, + self._comparison_info.device_self_dur_ms) + row = [None, self._module_class, self._module_level, self._module_name, "[ TOTAL ]", None, + self._base_info.device_self_dur_ms, self._base_info.number, self._base_info.device_total_dur_ms, + None, self._comparison_info.device_self_dur_ms, self._comparison_info.number, + self._comparison_info.device_total_dur_ms, total_diff, self_diff, + total_ratio, self._base_info.call_stack, 
self._comparison_info.call_stack] + return row + + def get_detail_rows(self): + rows = [] + for op_name, base_dur_dict in self._base_info.api_dict.items(): + base_dur_list = base_dur_dict.get("total", []) + com_dur_dict = self._comparison_info.api_dict.pop(op_name, {}) + com_dur_list = com_dur_dict.get("total", []) + base_kernel_detals, com_kernel_details = self._get_kernel_detail_rows(base_dur_dict.get("detail", {}), + com_dur_dict.get("detail", {})) + self_diff, self_ratio = calculate_diff_ratio(sum(base_dur_list), sum(com_dur_list)) + row = [None, self._module_class, self._module_level, self._module_name, op_name, base_kernel_detals, + sum(base_dur_list), len(base_dur_list), None, com_kernel_details, sum(com_dur_list), + len(com_dur_list), None, None, self_diff, self_ratio, None, None] + rows.append(row) + + for op_name, com_dur_dict in self._comparison_info.api_dict.items(): + com_dur_list = com_dur_dict.get("total", []) + base_kernel_detals, com_kernel_details = self._get_kernel_detail_rows({}, com_dur_dict.get("detail", {})) + self_diff, self_ratio = calculate_diff_ratio(0, sum(com_dur_list)) + row = [None, self._module_class, self._module_level, self._module_name, op_name, base_kernel_detals, 0, 0, + None, com_kernel_details, sum(com_dur_list), len(com_dur_list), None, None, self_diff, + self_ratio, None, None] + rows.append(row) + return rows + + +class ModuleStatisticInfo: + def __init__(self, data_list: list): + self._data_list = data_list + self.device_self_dur_ms = 0 + self.device_total_dur_ms = 0 + self.call_stack = "" + self.number = len(data_list) + self.api_dict = {} + self._get_info() + + def _get_info(self): + if self._data_list: + self.call_stack = self._data_list[0].call_stack + for module in self._data_list: + self.device_self_dur_ms += module.device_self_dur / Constant.US_TO_MS + self.device_total_dur_ms += module.device_total_dur / Constant.US_TO_MS + for torch_op in module.toy_layer_api_list: + self.api_dict.setdefault(torch_op.name, 
class OperatorCompareBean:
    """One comparison row for a matched pair of torch operators."""

    TABLE_NAME = Constant.OPERATOR_TABLE
    HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME)
    OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME)

    def __init__(self, index: int, base_op: TorchOpNode, comparison_op: TorchOpNode):
        self._index = index
        self._base_op = OperatorInfo(base_op)
        self._comparison_op = OperatorInfo(comparison_op)

    @property
    def row(self):
        """Row: 1-based order id, base fields, comparison fields, diff columns."""
        base = self._base_op
        comparison = self._comparison_op
        row = [self._index + 1,
               base.operator_name, base.input_shape, base.input_type, base.kernel_details, base.device_dur,
               comparison.operator_name, comparison.input_shape, comparison.input_type,
               comparison.kernel_details, comparison.device_dur]
        row.extend(calculate_diff_ratio(base.device_dur, comparison.device_dur))
        return row
class OperatorStatisticBean:
    """Aggregated comparison row for all calls of one operator name."""

    TABLE_NAME = Constant.OPERATOR_TOP_TABLE
    HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME)
    OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME)

    def __init__(self, name: str, base_data: list, comparison_data: list):
        self._name = name
        self._base_info = OperatorStatisticInfo(base_data)
        self._comparison_info = OperatorStatisticInfo(comparison_data)

    @property
    def row(self):
        """Row: order id placeholder, name, per-side totals, diff columns."""
        base = self._base_info
        comparison = self._comparison_info
        row = [None, self._name, base.device_dur_ms, base.number,
               comparison.device_dur_ms, comparison.number]
        row.extend(calculate_diff_ratio(base.device_dur_ms, comparison.device_dur_ms))
        return row
class KernelEvent:
    """Read-only view over one device kernel trace event.

    NOTE(review): assumes `event` exposes name/dur/task_id/task_type —
    confirm against TraceEventBean.
    """

    def __init__(self, event: TraceEventBean, device_type: str):
        self._event = event
        self._device_type = device_type

    @property
    def kernel_name(self) -> str:
        """Kernel name from the underlying trace event."""
        return self._event.name

    @property
    def device_dur(self) -> float:
        """Device-side duration of the kernel."""
        return self._event.dur

    @property
    def task_id(self) -> int:
        """Task id (meaningful for NPU events; see kernel_details)."""
        return self._event.task_id

    @property
    def task_type(self) -> str:
        """Task type string (meaningful for NPU events)."""
        return self._event.task_type

    @property
    def kernel_details(self):
        """One formatted detail line; GPU events omit task id/type."""
        if self._device_type == Constant.GPU:
            return f"{self.kernel_name} [duration: {self.device_dur}]\n"
        return f"{self.kernel_name}, {self.task_id}, {self.task_type} [duration: {self.device_dur}]\n"
class KernelDetailsBean:
    """Typed accessor over one parsed kernel-details row.

    Raw fields are read in :meth:`init`; pmu columns that are absent or
    "N/A" surface as NaN so callers can use ``pd.isna``.
    """

    def __init__(self, data: dict):
        self._data = data
        self._op_type = ""
        self._name = ""
        self._aiv_vec_time = 0.0
        self._mac_time = 0.0
        self._duration = 0.0
        self.init()

    @property
    def op_type(self) -> str:
        return self._op_type

    @property
    def name(self) -> str:
        return self._name

    @property
    def aiv_vec_time(self) -> float:
        # Idiom fix: membership test instead of chained == comparisons.
        if self._aiv_vec_time in ("", "N/A"):
            return float("nan")
        return convert_to_float(self._aiv_vec_time)

    @property
    def mac_time(self) -> float:
        if self._mac_time in ("", "N/A"):
            return float("nan")
        return convert_to_float(self._mac_time)

    @property
    def duration(self) -> float:
        return convert_to_float(self._duration)

    def is_hide_op_pmu(self):
        """True when the row carries no pmu columns at all."""
        # Idiom fix: `in dict` instead of `in dict.keys()`.
        return not ("mac_time(us)" in self._data or "aiv_vec_time(us)" in self._data)

    def is_vector(self):
        """Vector op: positive aiv time, or a mac time of exactly zero."""
        if not pd.isna(self.aiv_vec_time) and self.aiv_vec_time > 0:
            return True
        if not pd.isna(self.mac_time) and math.isclose(self.mac_time, 0.0):
            return True
        return False

    def is_invalid(self):
        """True when both pmu times are unavailable."""
        return pd.isna(self.aiv_vec_time) and pd.isna(self.mac_time)

    def is_fa_bwd(self):
        return 'bwd' in self.op_type.lower() or 'grad' in self.op_type.lower()

    def is_sdma(self):
        return self.name.lower().startswith("aclnninplacecopy") and "tensormove" in self.name.lower()

    def is_flash_attention(self):
        return "flashattention" in self.op_type.lower()

    def is_cube(self):
        return "matmul" in self.op_type.lower()

    def is_conv(self):
        return self.op_type.lower().startswith("conv")

    def is_conv_bwd(self):
        lower_op_type = self.op_type.lower()
        return any(bwd in lower_op_type for bwd in Constant.BWD_LIST)

    def is_page_attention(self):
        return "pagedattention" in self.op_type.lower()

    def init(self):
        """Pull raw fields out of the csv row dict (missing pmu cols -> "")."""
        self._op_type = self._data.get('Type', "")
        self._name = self._data.get('Name', "")
        self._aiv_vec_time = self._data.get('aiv_vec_time(us)', "")
        self._mac_time = self._data.get('mac_time(us)', "")
        self._duration = self._data.get('Duration(us)', 0)
# --- compare_backend/compare_bean/origin_data_bean/memory_record_bean.py ---
from compare_backend.utils.common_func import convert_to_float


class MemoryRecordBean:
    """One row of the memory record CSV; exposes total reserved memory in MB."""

    def __init__(self, data: dict):
        self._data = data
        self._total_reserved_mb = 0.0
        self.init()

    @property
    def total_reserved_mb(self) -> float:
        return convert_to_float(self._total_reserved_mb)

    def init(self):
        self._total_reserved_mb = self._data.get("Total Reserved(MB)", 0)

# --- compare_backend/compare_bean/origin_data_bean/operator_memory_bean.py ---
from decimal import Decimal

from compare_backend.utils.common_func import convert_to_float, convert_to_decimal


class OperatorMemoryBean:
    """One row of operator_memory.csv: name, size and alloc/release timestamps."""

    def __init__(self, data: dict):
        self._data = data
        self._name = ""
        self._size = 0.0
        self._allocation_time = Decimal(0)
        self._release_time = Decimal(0)
        self.init()

    @property
    def name(self) -> str:
        return self._name

    @property
    def size(self) -> float:
        return convert_to_float(self._size)

    @property
    def allocation_time(self) -> Decimal:
        # Missing / zero timestamps normalize to Decimal(0).
        if not self._allocation_time:
            return Decimal(0)
        return convert_to_decimal(self._allocation_time)

    @property
    def release_time(self) -> Decimal:
        if not self._release_time:
            return Decimal(0)
        return convert_to_decimal(self._release_time)

    def init(self):
        self._name = self._data.get("Name", "")
        self._size = self._data.get("Size(KB)", 0)
        self._allocation_time = self._data.get("Allocation Time(us)", 0)
        self._release_time = self._data.get("Release Time(us)", 0)

    def is_cann_op(self):
        return "cann::" in self._name

# --- compare_backend/compare_bean/origin_data_bean/trace_event_bean.py ---
from decimal import Decimal

from compare_backend.utils.common_func import convert_to_float, convert_to_decimal
from compare_backend.utils.constant import Constant


class TraceEventBean:
    """Typed wrapper over one Chrome-trace event dict (meta, X, flow, ...)."""

    def __init__(self, event: dict):
        self._event = event
        self._pid = 0
        self._tid = 0
        self._ts = Decimal(0)
        self._dur = 0.0
        self._ph = ""
        self._cat = ""
        self._name = ""
        self._args = {}
        self._is_torch_op = False
        self.init()

    @property
    def pid(self) -> int:
        return self._pid

    @property
    def tid(self) -> int:
        return self._tid

    @property
    def dur(self) -> float:
        return convert_to_float(self._dur)

    @property
    def start_time(self) -> Decimal:
        return convert_to_decimal(self._ts)

    @property
    def end_time(self) -> Decimal:
        return self.start_time + convert_to_decimal(self._dur)

    @property
    def name(self) -> str:
        return self._name

    @property
    def lower_name(self) -> str:
        return self._name.lower()

    @property
    def lower_cat(self) -> str:
        return self._cat.lower()

    @property
    def args(self) -> dict:
        return self._args

    @property
    def id(self) -> str:
        return self._event.get("id")

    @property
    def stream_id(self) -> int:
        # May be None when the event carries no 'Stream Id' arg.
        return self._args.get('Stream Id')

    @property
    def stream(self) -> int:
        # May be None when the event carries no 'stream' arg.
        return self._args.get("stream")

    @property
    def task_type(self) -> str:
        return self._args.get('Task Type')

    @property
    def task_id(self) -> int:
        return self._args.get('Task Id')

    @property
    def device_id(self) -> int:
        try:
            return int(self._args.get('Device Id', Constant.INVALID_VALUE))
        except Exception:
            return Constant.INVALID_VALUE

    @property
    def total_reserved(self):
        return self._args.get('Total Reserved', 0)

    @property
    def corr_id(self) -> int:
        return self._args.get('correlation_id')

    @property
    def process_name(self) -> str:
        # Fixed annotation: the 'name' arg of a process meta event is a string.
        return self._args.get("name", "")

    @property
    def bytes_kb(self) -> float:
        # Fixed annotation: true division always yields a float.
        return self._args.get("Bytes", 0) / Constant.BYTE_TO_KB

    @property
    def addr(self) -> str:
        return self._args.get("Addr")

    @property
    def event(self) -> dict:
        return self._event

    @property
    def is_torch_op(self) -> bool:
        return self._is_torch_op

    @is_torch_op.setter
    def is_torch_op(self, value: bool):
        self._is_torch_op = value

    def is_m_mode(self) -> bool:
        return self._ph == "M"

    def is_x_mode(self) -> bool:
        return self._ph == "X"

    def is_flow_start(self) -> bool:
        return self._ph == "s"

    def is_flow_end(self) -> bool:
        return self._ph == "f"

    def is_enqueue(self) -> bool:
        return self.lower_cat == "enqueue"

    def is_dequeue(self) -> bool:
        return self.lower_cat == "dequeue"

    def is_process_meta(self) -> bool:
        return self.is_m_mode() and self._name == "process_name"

    def is_thread_meta(self) -> bool:
        return self.is_m_mode() and self._name == "thread_name"

    def is_communication_op_thread(self) -> bool:
        return self._args.get("name", "").find("Communication") != -1

    def is_hccl_process_name(self) -> bool:
        return self.process_name == "HCCL"

    def is_overlap_process_name(self) -> bool:
        return self.process_name == "Overlap Analysis"

    def is_npu_process_name(self) -> bool:
        return self.process_name == "Ascend Hardware"

    def is_computing_event(self):
        return self._name == "Computing"

    def is_comm_not_overlap(self):
        return self._name == 'Communication(Not Overlapped)'

    def is_dict(self):
        return isinstance(self._event, dict)

    def is_kernel_cat(self):
        return self.lower_cat == "kernel"

    def is_nccl_name(self):
        return self.lower_name.startswith("nccl")

    def is_kernel_except_nccl(self):
        return self.is_kernel_cat() and not self.is_nccl_name()

    def is_memory_event(self):
        return self.lower_name == '[memory]' and self.device_id >= 0

    def is_compute_event(self):
        return self.task_type in ('AI_CORE', 'MIX_AIC', 'MIX_AIV', 'AI_CPU', 'AI_VECTOR_CORE', 'FFTS_PLUS')

    def is_sdma_event(self):
        return self.task_type in ('SDMA_SQE', 'PCIE_DMA_SQE')

    def is_event_wait(self):
        return self.task_type == 'EVENT_WAIT_SQE'

    def is_backward(self):
        return any(bwd in self.lower_name for bwd in Constant.BWD_LIST)

    def is_python_function(self):
        return self.lower_cat == "python_function"

    def is_optimizer(self):
        return self.lower_name.startswith("optimizer")

    def is_fwdbwd(self):
        return self.lower_cat == "fwdbwd"

    def is_step_profiler(self):
        return self.name.find("ProfilerStep#") != -1

    def reset_name(self, name):
        self._name = name

    def is_conv(self):
        return self.name.lower().startswith("aten::conv")

    def is_lccl(self):
        return self.lower_name == "kernel_aivec"

    def init(self):
        """Populate fields from the raw dict; non-dict events keep defaults."""
        if isinstance(self._event, dict):
            self._pid = self._event.get("pid", 0)
            self._tid = self._event.get("tid", 0)
            self._ts = self._event.get("ts", 0)
            self._dur = self._event.get("dur", 0)
            self._ph = self._event.get("ph", "")
            self._cat = self._event.get("cat", "")
            self._name = self._event.get("name", "")
            self._args = self._event.get("args", {})
# --- compare_backend/compare_bean/profiling_info.py ---
from compare_backend.utils.constant import Constant


class ProfilingInfo:
    """Mutable accumulator for one profiling run's overall metrics.

    Time counters are accumulated in microseconds; call trans_time_to_s()
    once at the end to convert them to seconds.
    """

    TABLE_NAME = Constant.PERFORMANCE_TABLE
    HEADERS = []
    OVERHEAD = []

    # All attributes held in microseconds until trans_time_to_s() runs.
    _US_TIME_ATTRS = (
        "cube_time", "other_time", "vec_time", "compute_time",
        "communication_not_overlapped", "wait_time", "e2e_time",
        "sdma_time", "scheduling_time", "fa_time_bwd", "fa_time_fwd",
        "pa_time", "lccl_time", "conv_time_fwd", "conv_time_bwd",
    )

    def __init__(self, profiling_type: str):
        self.profiling_type = profiling_type
        self.cube_time = 0.0
        self.other_time = 0.0
        self.vec_time = 0.0
        self.cube_num = 0
        self.vec_num = 0
        self.sdma_num = 0
        self.fa_num_fwd = 0
        self.fa_num_bwd = 0
        self.pa_num = 0
        self.lccl_num = 0
        self.conv_time_fwd = 0.0
        self.conv_time_bwd = 0.0
        self.conv_num_fwd = 0
        self.conv_num_bwd = 0
        self.compute_time = 0.0
        self.communication_not_overlapped = 0.0
        self.wait_time = 0.0
        self.memory_used = 0.0
        self.e2e_time = 0.0
        self.sdma_time = 0.0
        self.scheduling_time = 0.0
        self.fa_time_bwd = 0.0
        self.pa_time = 0.0
        self.lccl_time = 0.0
        self.fa_time_fwd = 0.0
        self.minimal_profiling = False
        self.hide_op_details = False
        self.is_level0 = False

    def trans_time_to_s(self):
        """Convert every microsecond counter to seconds, in place."""
        # One loop instead of fifteen hand-written divisions (same math).
        for attr in self._US_TIME_ATTRS:
            setattr(self, attr, getattr(self, attr) / 10 ** 6)

    def calculate_other_time(self):
        # Clamp at zero: rounding can push the residual slightly negative.
        self.other_time = max(
            [0, self.compute_time - self.cube_time - self.fa_time_fwd - self.fa_time_bwd -
             self.pa_time - self.vec_time - self.conv_time_fwd - self.conv_time_bwd])

    def calculate_vec_time(self):
        self.vec_time = self.compute_time - self.cube_time - self.fa_time_fwd - self.fa_time_bwd \
                        - self.conv_time_fwd - self.conv_time_bwd

    def calculate_schedule_time(self):
        self.scheduling_time = (self.e2e_time - self.compute_time - self.lccl_time \
                                - self.communication_not_overlapped)

    def update_fa_fwd_info(self, time: float):
        self.fa_time_fwd += time
        self.fa_num_fwd += 1

    def update_fa_bwd_info(self, time: float):
        self.fa_time_bwd += time
        self.fa_num_bwd += 1

    def update_pa_info(self, time: float):
        self.pa_time += time
        self.pa_num += 1

    def update_lccl_info(self, time: float):
        self.lccl_time += time
        self.lccl_num += 1

    def update_conv_fwd_info(self, time: float):
        self.conv_time_fwd += time
        self.conv_num_fwd += 1

    def update_conv_bwd_info(self, time: float):
        self.conv_time_bwd += time
        self.conv_num_bwd += 1

    def update_sdma_info(self, time: float, num: int = 1):
        self.sdma_time += time
        self.sdma_num += num

    def update_cube_info(self, time: float):
        self.cube_time += time
        self.cube_num += 1

    def update_vec_info(self, time: float):
        self.vec_time += time
        self.vec_num += 1

    def set_compute_time(self, time: float):
        self.compute_time = time

    def update_compute_time(self, time: float):
        self.compute_time += time

    def set_e2e_time(self, time: float):
        self.e2e_time = time

    def set_comm_not_overlap(self, time: float):
        self.communication_not_overlapped = time

    def update_comm_not_overlap(self, time: float):
        self.communication_not_overlapped += time

    def update_comm_not_overlap_wait_time(self, time: float):
        self.wait_time = time

    def set_memory_used(self, memory: float):
        self.memory_used = memory

    def is_not_minimal_profiling(self) -> bool:
        return self.profiling_type == Constant.NPU and not self.minimal_profiling
# --- compare_backend/comparison_generator.py ---
from compare_backend.generator.detail_performance_generator import DetailPerformanceGenerator
from compare_backend.generator.overall_performance_generator import OverallPerformanceGenerator
from compare_backend.interface.overall_interface import OverallInterface
from compare_backend.profiling_parser.gpu_profiling_parser import GPUProfilingParser
from compare_backend.profiling_parser.npu_profiling_parser import NPUProfilingParser
from compare_backend.utils.constant import Constant
from compare_backend.utils.args_manager import ArgsManager


class ComparisonGenerator:
    """Drives a base-vs-comparison run: parse both profiles, then compare."""

    PARSER_DICT = {Constant.NPU: NPUProfilingParser, Constant.GPU: GPUProfilingParser}
    INTERFACE_DICT = {Constant.OVERALL_COMPARE: OverallInterface}

    def __init__(self, args):
        self._args_manager = ArgsManager()
        self._args_manager.init(args)
        self._data_dict = {}

    def run(self):
        self.load_data()
        self.generate_compare_result()

    def load_data(self):
        """Parse both profiling directories with the backend-matched parser."""
        manager = self._args_manager
        base_parser_cls = self.PARSER_DICT.get(manager.base_profiling_type)
        self._data_dict[Constant.BASE_DATA] = base_parser_cls(
            manager.args, manager.base_path_dict).load_data()
        comparison_parser_cls = self.PARSER_DICT.get(manager.comparison_profiling_type)
        self._data_dict[Constant.COMPARISON_DATA] = comparison_parser_cls(
            manager.args, manager.comparison_path_dict).load_data()

    def generate_compare_result(self):
        """Run the overall and detail generators as parallel processes."""
        overall_data = {Constant.BASE_DATA: self._data_dict.get(Constant.BASE_DATA).overall_metrics,
                        Constant.COMPARISON_DATA: self._data_dict.get(Constant.COMPARISON_DATA).overall_metrics}
        workers = [OverallPerformanceGenerator(overall_data, self._args_manager.args),
                   DetailPerformanceGenerator(self._data_dict, self._args_manager.args)]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

    def run_interface(self, compare_type: str) -> dict:
        """Programmatic entry point: return the result dict for one compare type."""
        self.load_data()
        interface = self.INTERFACE_DICT.get(compare_type)
        if interface:
            return interface(self._data_dict).run()
        return {}

# --- compare_backend/data_prepare/module_data_prepare.py ---
import copy
from queue import Queue

from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean
from compare_backend.profiling_parser.base_profiling_parser import ProfilingResult
from compare_backend.utils.constant import Constant
from compare_backend.utils.module_node import ModuleNode
from compare_backend.utils.tree_builder import TreeBuilder


class ModuleDataPrepare:
    """Builds forward/backward nn.Module trees out of one profiling result."""

    def __init__(self, profiling_data: ProfilingResult):
        self.profiling_data = profiling_data
        self._nn_module_list = []
        self._call_function = []
        # Split python_function events into nn.Module frames vs. plain calls.
        for event in profiling_data.python_function_data:
            if event.lower_name.startswith("nn.module:"):
                self._nn_module_list.append(event)
            else:
                self._call_function.append(event)
        self._bwd_dict = {}
        self._bwd_pid = self._get_bwd_pid()

    @staticmethod
    def update_module_node_info(fwd_root_node, bwd_root_node, func_root_node):
        """BFS both module trees, attaching kernels and call stacks per node."""
        pending = Queue()
        pending.put(fwd_root_node)
        pending.put(bwd_root_node)
        while not pending.empty():
            node = pending.get()
            node.update_torch_op_kernel_list()
            call_function = func_root_node.find_module_call(node.start_time)
            if call_function:
                node.reset_call_stack(call_function.call_stack)
            for child in node.child_nodes:
                pending.put(child)

    def build_module_tree(self):
        """Return [fwd_root, bwd_root]; [None, None] when there are no modules."""
        if not self._nn_module_list:
            return [None, None]
        self._dispatch_torch_op()
        # Seed the tree with synthetic events at every kernel launch time.
        event_list = [TraceEventBean({"ts": ts}) for ts in self.profiling_data.kernel_dict.keys()]
        self._nn_module_list.extend(event_list)
        root_node = TreeBuilder.build_module_tree(self._nn_module_list, self.profiling_data.kernel_dict)
        func_root_node = TreeBuilder.build_module_tree(self._call_function, {})
        bwd_module_list = self.get_bwd_module(root_node)
        if bwd_module_list:
            bwd_module_list.extend(event_list)
        bwd_root_node = TreeBuilder.build_module_tree(bwd_module_list, self.profiling_data.kernel_dict)
        self.match_torch_op(root_node, bwd_root_node)
        self.update_module_node_info(root_node, bwd_root_node, func_root_node)
        return [root_node, bwd_root_node]

    def get_bwd_module(self, root_node: ModuleNode):
        """Derive backward module events by following fwd->bwd flow pairs."""
        bwd_module_list = []
        for flow in self.profiling_data.fwdbwd_dict.values():
            start_point = flow.get("start")
            end_point = flow.get("end")
            if not start_point or not end_point:
                continue
            end_event = self._bwd_dict.get(end_point.start_time)
            if not end_event:
                continue
            call_module = root_node.find_module_call(start_point.start_time)
            if call_module:
                bwd_event = copy.deepcopy(end_event)
                bwd_event.reset_name(f"[ BACKWARD ]{call_module.module_name}")
                bwd_module_list.append(bwd_event)
        return bwd_module_list

    def match_torch_op(self, fwd_root_node, bwd_root_node):
        """Attach each torch op to the module (fwd first, then bwd) covering it."""
        torch_op_list = sorted(self.profiling_data.torch_op_data, key=lambda x: x.start_time)
        for torch_op in torch_op_list:
            if torch_op.is_optimizer():
                continue
            if torch_op.is_step_profiler():
                continue
            owner = fwd_root_node.find_module_call(torch_op.start_time)
            if owner:
                owner.find_torch_op_call(torch_op)
                continue
            owner = bwd_root_node.find_module_call(torch_op.start_time)
            if owner:
                owner.find_torch_op_call(torch_op)

    def _dispatch_torch_op(self):
        """Route optimizer ops into the module list, backward ops into _bwd_dict."""
        for torch_op in self.profiling_data.torch_op_data:
            if torch_op.is_optimizer():
                self._nn_module_list.append(torch_op)
                continue
            if torch_op.pid == self._bwd_pid:
                self._bwd_dict[torch_op.start_time] = torch_op

    def _get_bwd_pid(self):
        """The pid owning backward ops is whatever any fwdbwd flow ends in."""
        for flow in self.profiling_data.fwdbwd_dict.values():
            end_point = flow.get("end")
            if end_point:
                return end_point.pid
        return Constant.INVALID_VALUE

# --- compare_backend/data_prepare/operator_data_prepare.py ---
from compare_backend.profiling_parser.base_profiling_parser import ProfilingResult
from compare_backend.utils.tree_builder import TreeBuilder


class OperatorDataPrepare:
    """Extracts the top-layer torch op nodes from one profiling result."""

    def __init__(self, profiling_data: ProfilingResult):
        self.profiling_data = profiling_data

    def get_top_layer_ops(self) -> any:
        """Return level-1 op nodes, descending into ProfilerStep# wrappers."""
        root_node = TreeBuilder.build_tree(self.profiling_data.torch_op_data, self.profiling_data.kernel_dict,
                                           self.profiling_data.memory_list)
        result_data = []
        for top_node in root_node.child_nodes:
            if top_node.is_step_profiler():
                result_data.extend(top_node.child_nodes)
            else:
                result_data.append(top_node)
        return result_data
# --- compare_backend/disaggregate/overall_perf_interface.py ---
from common_func.path_manager import PathManager
from compare_backend.profiling_parser.gpu_profiling_parser import GPUProfilingParser
from compare_backend.profiling_parser.npu_profiling_parser import NPUProfilingParser
from compare_backend.utils.args_manager import ArgsManager
from compare_backend.utils.compare_args import Args
from compare_backend.utils.constant import Constant


class OverallPerfInterface:
    """Single-profile entry point: parse one path and return overall metrics."""

    PARSER_DICT = {Constant.NPU: NPUProfilingParser, Constant.GPU: GPUProfilingParser}

    def __init__(self, profiling_path: str):
        self._profiling_path = profiling_path
        self._profiling_path_dict = {}
        self._result_data = {}
        # Fix: declare the attribute here instead of creating it only inside
        # _load_data, so _generate_result never hits AttributeError if the
        # call order is ever changed.
        self._profiling_data = None

    def run(self):
        self._check_path()
        self._load_data()
        self._generate_result()
        return self._result_data

    def _check_path(self):
        """Resolve and classify the profiling path (raises on invalid paths)."""
        profiling_path = PathManager.get_realpath(self._profiling_path)
        self._profiling_path_dict = ArgsManager().parse_profiling_path(profiling_path)

    def _load_data(self):
        """Parse the profile with the backend-matched parser (NPU by default)."""
        args = Args(enable_profiling_compare=True)
        profiling_type = self._profiling_path_dict.get(Constant.PROFILING_TYPE, Constant.NPU)
        self._profiling_data = self.PARSER_DICT.get(profiling_type)(args, self._profiling_path_dict).load_data()

    def _generate_result(self):
        overall_data = self._profiling_data.overall_metrics
        self._result_data = getattr(overall_data, "__dict__", {})
# --- compare_backend/generator/base_generator.py ---
from abc import ABC, abstractmethod
from collections import OrderedDict
from multiprocessing import Process


class BaseGenerator(Process, ABC):
    """Process-backed template for comparison generators.

    Subclasses fill self._result_data in compare() and render it in
    generate_view(); run() executes the two steps in order when the
    process starts.
    """

    def __init__(self, profiling_data_dict: dict, args: any):
        super(BaseGenerator, self).__init__()
        self._profiling_data_dict = profiling_data_dict
        self._args = args
        self._result_data = OrderedDict()

    def run(self):
        # Template method: compute first, then render.
        self.compare()
        self.generate_view()

    @abstractmethod
    def compare(self):
        raise NotImplementedError("Function compare need to be implemented.")

    @abstractmethod
    def generate_view(self):
        raise NotImplementedError("Function generate_view need to be implemented.")
# --- compare_backend/generator/detail_performance_generator.py ---
import os
from collections import deque
from datetime import datetime
from queue import Queue

from compare_backend.comparator.communication_comparator import CommunicationComparator
from compare_backend.comparator.module_comparetor import ModuleComparator
from compare_backend.comparator.module_statistic_comparator import ModuleStatisticComparator
from compare_backend.comparator.operator_comparator import OperatorComparator
from compare_backend.comparator.operator_statistic_comparator import OperatorStatisticComparator
from compare_backend.compare_bean.communication_bean import CommunicationBean
from compare_backend.compare_bean.memory_compare_bean import MemoryCompareBean
from compare_backend.compare_bean.memory_statistic_bean import MemoryStatisticBean
from compare_backend.compare_bean.module_compare_bean import ModuleCompareBean
from compare_backend.compare_bean.module_statistic_bean import ModuleStatisticBean
from compare_backend.compare_bean.operator_compare_bean import OperatorCompareBean
from compare_backend.compare_bean.operator_statistic_bean import OperatorStatisticBean
from compare_backend.data_prepare.module_data_prepare import ModuleDataPrepare
from compare_backend.data_prepare.operator_data_prepare import OperatorDataPrepare
from compare_backend.generator.base_generator import BaseGenerator
from compare_backend.utils.common_func import longest_common_subsequence_matching
from compare_backend.utils.constant import Constant
from compare_backend.utils.module_node import ModuleNode
from compare_backend.utils.name_function import NameFunction
from compare_backend.utils.torch_op_node import TorchOpNode
from compare_backend.view.excel_view import ExcelView


class DetailPerformanceGenerator(BaseGenerator):
    """Produces the operator/memory/communication detail comparison workbook."""

    def __init__(self, profiling_data_dict: dict, args: any):
        super().__init__(profiling_data_dict, args)

    def compare(self):
        """Run every enabled comparator and merge results into _result_data."""
        if self._args.enable_operator_compare or self._args.enable_memory_compare or \
                self._args.enable_communication_compare:
            print("[INFO] Start to compare performance detail data, please wait.")
            for comparator in self._create_comparator():
                self._result_data.update(comparator.generate_data())

    def generate_view(self):
        """Write the xlsx report; no-op when nothing was compared."""
        if not self._result_data:
            return
        dir_path = self._args.output_path if self._args.output_path else "./"
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # consider datetime.now(timezone.utc) when the toolchain allows.
        file_name = "performance_comparison_result_{}.xlsx".format(datetime.utcnow().strftime("%Y%m%d%H%M%S"))
        result_file_path = os.path.realpath(os.path.join(dir_path, file_name))
        ExcelView(self._result_data, result_file_path, self._args).generate_view()
        print(f"[INFO] The comparison result file has been generated: {result_file_path}")

    def _create_comparator(self):
        """Build the comparator list according to the enabled compare flags."""
        comparators = []

        op_compare_result = []
        if self._args.enable_operator_compare:
            # Module-level matching needs python_function data on BOTH sides.
            module_compare_result = self.match_nn_module() if self._profiling_data_dict.get(
                Constant.BASE_DATA).python_function_data and self._profiling_data_dict.get(
                Constant.COMPARISON_DATA).python_function_data else []
            if not module_compare_result:
                op_compare_result = self.match_torch_op()

        if self._args.enable_memory_compare and not op_compare_result:
            op_compare_result = self.match_torch_op()

        if self._args.enable_communication_compare:
            communication_data = {
                Constant.BASE_DATA: self._profiling_data_dict.get(Constant.BASE_DATA).communication_dict,
                Constant.COMPARISON_DATA: self._profiling_data_dict.get(Constant.COMPARISON_DATA).communication_dict}
            comparators.append(CommunicationComparator(communication_data, CommunicationBean))

        if self._args.enable_operator_compare:
            if module_compare_result:
                comparators.append(ModuleStatisticComparator(module_compare_result, ModuleStatisticBean))
                comparators.append(ModuleComparator(module_compare_result, ModuleCompareBean))
            else:
                comparators.append(OperatorStatisticComparator(op_compare_result, OperatorStatisticBean))
                comparators.append(OperatorComparator(op_compare_result, OperatorCompareBean))
        if self._args.enable_memory_compare:
            comparators.append(OperatorStatisticComparator(op_compare_result, MemoryStatisticBean))
            comparators.append(OperatorComparator(op_compare_result, MemoryCompareBean))
        return comparators

    def match_torch_op(self) -> list:
        """LCS-match top-layer torch ops of both runs, drilling down if asked."""
        base_ops = OperatorDataPrepare(self._profiling_data_dict.get(Constant.BASE_DATA)).get_top_layer_ops()
        comparison_ops = OperatorDataPrepare(
            self._profiling_data_dict.get(Constant.COMPARISON_DATA)).get_top_layer_ops()
        if not base_ops and not comparison_ops:
            return []
        name_func = NameFunction(self._args).get_name_func()
        op_compare_result = longest_common_subsequence_matching(base_ops, comparison_ops, name_func)
        if self._args.max_kernel_num is not None:
            op_compare_result = self._drill_down(op_compare_result, name_func)
        return op_compare_result

    def _drill_down(self, compare_result_data: list, name_func: any) -> list:
        """Recursively re-match children of pairs exceeding max_kernel_num."""
        drill_down_result = []
        compare_result_data.reverse()
        pending = deque(compare_result_data)
        while pending:
            pair = pending.pop()
            base_op = pair[0] if pair[0] else TorchOpNode()
            comparison_op = pair[1] if pair[1] else TorchOpNode()
            if not base_op.child_nodes or not comparison_op.child_nodes:
                drill_down_result.append(pair)
                continue
            if max(base_op.kernel_num, comparison_op.kernel_num) <= self._args.max_kernel_num:
                drill_down_result.append(pair)
                continue
            matched_children = longest_common_subsequence_matching(
                base_op.child_nodes, comparison_op.child_nodes, name_func)
            matched_children.reverse()
            for child_pair in matched_children:
                pending.append(child_pair)

        return drill_down_result

    def match_nn_module(self) -> list:
        """Match fwd and bwd module trees of both runs, level by level."""
        module_compare_result = []
        base_root_node = ModuleDataPrepare(self._profiling_data_dict.get(Constant.BASE_DATA)).build_module_tree()
        comparison_root_node = ModuleDataPrepare(
            self._profiling_data_dict.get(Constant.COMPARISON_DATA)).build_module_tree()
        for index, base_node in enumerate(base_root_node):
            comparison_node = comparison_root_node[index] if index < len(comparison_root_node) else None
            if not base_node or not comparison_node:
                continue
            module_compare_result.extend(self._matching_all_modules(base_node, comparison_node))
        return module_compare_result

    def _matching_all_modules(self, base_node: ModuleNode, comparison_node: ModuleNode):
        """BFS the two module trees, matching children of every matched pair."""
        all_matched_modules = []
        matched_queue = Queue()
        matched_queue.put([base_node, comparison_node])
        while not matched_queue.empty():
            matched_base_node, matched_comparison_node = matched_queue.get()
            matched_node_list = self._matching_common_subsequence(matched_base_node, matched_comparison_node)
            all_matched_modules.extend(matched_node_list)
            for matched_node in matched_node_list:
                matched_queue.put(matched_node)
        return all_matched_modules

    def _matching_common_subsequence(self, base_node: ModuleNode, comparison_node: ModuleNode):
        """LCS-match the immediate children of two module nodes by name."""
        base_modules = base_node.child_nodes if base_node else []
        comparison_modules = comparison_node.child_nodes if comparison_node else []
        if not base_modules and not comparison_modules:
            return []
        name_func = NameFunction(self._args).get_module_name
        return longest_common_subsequence_matching(base_modules, comparison_modules, name_func)

# --- compare_backend/generator/overall_performance_generator.py ---
from compare_backend.comparator.overall_performance_comparator import OverallPerformanceComparator
from compare_backend.compare_bean.profiling_info import ProfilingInfo
from compare_backend.generator.base_generator import BaseGenerator
from compare_backend.view.screen_view import ScreenView


class OverallPerformanceGenerator(BaseGenerator):
    """Prints the overall base-vs-comparison performance table to screen."""

    def __init__(self, profiling_data_dict: dict, args: any):
        super().__init__(profiling_data_dict, args)

    def compare(self):
        if not self._args.enable_profiling_compare:
            return
        self._result_data = OverallPerformanceComparator(self._profiling_data_dict, ProfilingInfo).generate_data()

    def generate_view(self):
        if not self._result_data:
            return
        ScreenView(self._result_data).generate_view()

# --- compare_backend/interface/overall_interface.py ---
from compare_backend.comparator.overall_performance_comparator import OverallPerformanceComparator
from compare_backend.compare_bean.profiling_info import ProfilingInfo
from compare_backend.utils.constant import Constant


class OverallInterface:
    """Thin programmatic wrapper returning the overall comparison dict."""

    def __init__(self, overall_data: dict):
        self._overall_data = overall_data

    def run(self):
        data = {Constant.BASE_DATA: self._overall_data.get(Constant.BASE_DATA).overall_metrics,
                Constant.COMPARISON_DATA: self._overall_data.get(Constant.COMPARISON_DATA).overall_metrics}
        return OverallPerformanceComparator(data, ProfilingInfo).generate_data()
0000000000000000000000000000000000000000..2127ff5e75e23e98f0debb0dfdafbeb01930c082 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/profiling_parser/base_profiling_parser.py @@ -0,0 +1,211 @@ +from abc import abstractmethod, ABC +from decimal import Decimal + +from compare_backend.compare_bean.origin_data_bean.compare_event import KernelEvent, MemoryEvent +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.compare_bean.profiling_info import ProfilingInfo +from compare_backend.utils.constant import Constant +from compare_backend.utils.file_reader import FileReader + + +class ProfilingResult: + + def __init__(self, profiling_type): + self._profiling_type = profiling_type + self.torch_op_data = [] + self.kernel_dict = {} + self.memory_list = [] + self.communication_dict = {} + self.overall_metrics = ProfilingInfo(profiling_type) + self.python_function_data = [] + self.fwdbwd_dict = {} + + def update_torch_op_data(self, event: TraceEventBean): + event.is_torch_op = True + self.torch_op_data.append(event) + + def update_python_function_data(self, event: TraceEventBean): + self.python_function_data.append(event) + + def update_fwdbwd_data(self, flow_type: str, event: TraceEventBean): + self.fwdbwd_dict.setdefault(event.id, {})[flow_type] = event + + def update_kernel_dict(self, start_time: Decimal, kernel_event: TraceEventBean): + self.kernel_dict.setdefault(start_time, []).append(KernelEvent(kernel_event, self._profiling_type)) + + def update_memory_list(self, memory_data: dict): + self.memory_list.append(MemoryEvent(memory_data)) + + def update_communication_dict(self, comm_name: str, comm_dur: float): + self.communication_dict.setdefault(comm_name, {}).setdefault("comm_list", []).append(comm_dur) + + def update_comm_task_data(self, comm_name: str, task_event: TraceEventBean): + self.communication_dict.setdefault(comm_name, {}).setdefault("comm_task", {}).setdefault( + task_event.name, 
[]).append(task_event.dur) + + +class BaseProfilingParser(ABC): + + def __init__(self, args: any, path_dict: dict): + self._args = args + self._profiling_type = path_dict.get(Constant.PROFILING_TYPE) + self._profiling_path = path_dict.get(Constant.PROFILING_PATH) + self._json_path = path_dict.get(Constant.TRACE_PATH) + self._trace_events = [] if self._profiling_path == Constant.NPU else {} + self._enable_profiling_compare = args.enable_profiling_compare + self._enable_operator_compare = args.enable_operator_compare + self._enable_memory_compare = args.enable_memory_compare + self._enable_communication_compare = args.enable_communication_compare + self._dispatch_func = self._get_dispatch_func() + self._result_data = ProfilingResult(self._profiling_type) + self._memory_events = [] + self._flow_dict = {} + self._fwdbwd_dict = {} + self._all_kernels = {} + self._comm_task_list = [] + self._comm_list = [] + self._read_trace_event() + self._cur_func_index = 0 + + @abstractmethod + def _update_memory_list(self): + raise NotImplementedError("Function _update_memory_list need to be implemented.") + + @abstractmethod + def _update_overall_metrics(self): + raise NotImplementedError("Function _update_overall_metrics need to be implemented.") + + @abstractmethod + def _is_kernel_event(self, event: TraceEventBean): + raise NotImplementedError("Function _is_kernel_event need to be implemented.") + + @abstractmethod + def _is_flow_event(self, event: TraceEventBean): + raise NotImplementedError("Function _is_flow_event need to be implemented.") + + @abstractmethod + def _is_torch_op_event(self, event: TraceEventBean): + raise NotImplementedError("Function _is_torch_op_event need to be implemented.") + + @abstractmethod + def _get_dispatch_func(self): + raise NotImplementedError("Function _get_dispatch_func need to be implemented.") + + def load_data(self) -> ProfilingResult: + self._dispatch_events() + self._update_kernel_dict() + self._update_communication_dict() + if 
self._enable_memory_compare: + self._update_memory_list() + if self._enable_profiling_compare: + self._update_overall_metrics() + self._check_result_data() + return self._result_data + + def _dispatch_events(self): + if not self._dispatch_func: + return + index_list = list(range(0, len(self._dispatch_func))) * 2 + for event in self._trace_events: + if not event.is_dict(): + continue + if event.is_m_mode(): + continue + self.__picking_event(event, index_list) + + def __picking_event(self, event: TraceEventBean, index_list: list): + for index in range(self._cur_func_index, self._cur_func_index + len(self._dispatch_func)): + func_index = index_list[index] + res = self._dispatch_func[func_index](event) + if res: + self._cur_func_index = func_index + break + + def _picking_torch_op_event(self, event: TraceEventBean): + if self._is_torch_op_event(event): + self._result_data.update_torch_op_data(event) + return True + return False + + def _picking_kernel_event(self, event: TraceEventBean): + if self._is_kernel_event(event): + self._all_kernels[f"{event.pid}-{event.tid}-{event.start_time}"] = event + return True + return False + + def _picking_flow_event(self, event: TraceEventBean): + if self._is_flow_event(event): + if event.is_flow_start(): + self._flow_dict.setdefault(event.id, {})["start"] = event + elif event.is_flow_end(): + self._flow_dict.setdefault(event.id, {})["end"] = event + return True + return False + + def _picking_python_function_event(self, event: TraceEventBean): + if event.is_python_function(): + self._result_data.update_python_function_data(event) + return True + return False + + def _picking_fwdbwd_flow_event(self, event: TraceEventBean): + if event.is_fwdbwd(): + if event.is_flow_start(): + self._result_data.update_fwdbwd_data("start", event) + elif event.is_flow_end(): + self._result_data.update_fwdbwd_data("end", event) + return True + return False + + def _update_kernel_dict(self): + if self._profiling_type == Constant.NPU: + for comm in 
self._comm_list: + self._all_kernels[f"{comm.pid}-{comm.tid}-{comm.start_time}"] = comm + for flow_event in self._flow_dict.values(): + start_event = flow_event.get("start") + end_event = flow_event.get("end") + if not start_event or not end_event: + continue + kernel_event = self._all_kernels.get(f"{end_event.pid}-{end_event.tid}-{end_event.start_time}") + if not kernel_event: + continue + self._result_data.update_kernel_dict(start_event.start_time, kernel_event) + + def _update_communication_dict(self): + if self._profiling_type == Constant.GPU: + self._comm_list = list(filter(lambda x: x.is_nccl_name(), self._all_kernels.values())) + self._comm_list.sort(key=lambda x: x.start_time) + self._comm_task_list.sort(key=lambda x: x.start_time) + task_index = 0 + for communication_op in self._comm_list: + name_list = communication_op.lower_name.split("_") + if len(name_list) < 2: + continue + comm_name = name_list[1] + self._result_data.update_communication_dict(comm_name, communication_op.dur) + while task_index < len(self._comm_task_list): + task_event = self._comm_task_list[task_index] + if task_event.start_time < communication_op.start_time: + task_index += 1 + continue + if task_event.start_time > communication_op.end_time: + break + self._result_data.update_comm_task_data(comm_name, task_event) + task_index += 1 + + def _check_result_data(self): + if self._enable_operator_compare or self._enable_memory_compare: + if not self._result_data.torch_op_data: + print(f"[WARNING] Can't find any torch op in the file: {self._profiling_path}") + if self._enable_operator_compare and not self._result_data.kernel_dict: + print(f"[WARNING] Can't find any flow event in the file: {self._profiling_path}") + if self._enable_memory_compare and not self._result_data.memory_list: + print(f"[WARNING] Can't find any memory event in the file: {self._profiling_path}") + if self._enable_communication_compare and not self._result_data.communication_dict: + print(f"[WARNING] Can't find any 
communication op in the file: {self._profiling_path}") + + def _read_trace_event(self): + try: + self._trace_events = FileReader.read_trace_file(self._json_path) + except Exception: + print(f"[ERROR] Failed to read the file: {self._json_path}") diff --git a/profiler/compare_tools_review/compare_backend/profiling_parser/gpu_profiling_parser.py b/profiler/compare_tools_review/compare_backend/profiling_parser/gpu_profiling_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..c4089aec9bdcb35b80ae9ff9121fcd75bde3a63e --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/profiling_parser/gpu_profiling_parser.py @@ -0,0 +1,189 @@ +import sys +from collections import defaultdict, Counter + +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.profiling_parser.base_profiling_parser import BaseProfilingParser +from compare_backend.utils.constant import Constant + + +class GPUProfilingParser(BaseProfilingParser): + CUBE_MARK = ['gemm', 'conv', 'cutlass', 'wgrad'] + FA_MARK_LIST = [['fmha', 'kernel'], ['flash', 'kernel'], ['attention', 'kernel']] + SDMA_MARK_LIST = ['htod', 'dtod', 'dtoh', 'memset (device)'] + FLOW_CAT = ("async_gpu", "async_cpu_to_gpu", "ac2g", "async") + TORCH_OP_CAT = ("cpu_op", "user_annotation", "cuda_runtime", "operator", "runtime") + + def __init__(self, args: any, path_dict: dict): + super().__init__(args, path_dict) + self._trace_events = [TraceEventBean(event) for event in self._trace_events.get("traceEvents", [])] + self._flow_cat = (args.gpu_flow_cat,) if args.gpu_flow_cat else self.FLOW_CAT + self._compute_stream_id = self._infer_compute_stream_id() + self._marks = defaultdict(int) + self._aten_index = 0 + + @classmethod + def __is_flash_attention(cls, name: str): + for fa_mark in cls.FA_MARK_LIST: + if not [1 for mark in fa_mark if mark not in name.lower()]: + return True + return False + + @classmethod + def __is_sdma_time(cls, name: str): + for mark 
in cls.SDMA_MARK_LIST: + if mark in name.lower(): + return True + return False + + def _update_memory_list(self): + if not self._enable_memory_compare: + return + self._memory_events.sort(key=lambda x: x.start_time) + addr_dict = {} + for memory_event in self._memory_events: + allocate_bytes = memory_event.bytes_kb + record = addr_dict.get(memory_event.addr) + if allocate_bytes > 0: + if record: + self._result_data.update_memory_list(record) + addr_dict[memory_event.addr] = {Constant.SIZE: allocate_bytes, + Constant.TS: memory_event.start_time, + Constant.ALLOCATION_TIME: memory_event.start_time} + if allocate_bytes < 0 and record: + if abs(allocate_bytes) == record.get(Constant.SIZE): + record[Constant.RELEASE_TIME] = memory_event.start_time + self._result_data.update_memory_list(record) + del addr_dict[memory_event.addr] + for record in addr_dict.values(): + self._result_data.update_memory_list(record) + + def _update_overall_metrics(self): + self._calculate_performance_time() + self.__parse_memory_reserved() + self._result_data.overall_metrics.calculate_vec_time() + self._result_data.overall_metrics.calculate_schedule_time() + self._result_data.overall_metrics.trans_time_to_s() + + def _calculate_performance_time(self): + min_ts = sys.float_info.max + max_ts = sys.float_info.min + self._trace_events.sort(key=lambda x: x.start_time) + aten_events = list(filter(lambda x: x.name.startswith("aten::"), self._trace_events)) + flow_dict_new = {} + for flow_event in self._flow_dict.values(): + start_event = flow_event.get("start") + end_event = flow_event.get("end") + if start_event and end_event: + flow_dict_new[end_event.start_time] = start_event.start_time + for event in self._trace_events: + if event.stream: + min_ts = min(event.start_time, min_ts) + max_ts = max(event.end_time, max_ts) + if event.stream == self._compute_stream_id and self.__is_sdma_time(event.name): + self._result_data.overall_metrics.update_sdma_info(event.dur) + continue + if not 
event.is_kernel_cat(): + continue + self.__add_marks(event) + if event.is_nccl_name(): + continue + self.__add_compute_time(event, aten_events, flow_dict_new) + self._aten_events = None + self._result_data.overall_metrics.set_e2e_time(float(max_ts - min_ts)) + self.__add_compute_and_overlap_time() + + def __add_compute_and_overlap_time(self): + compute_time = len([_ for _, value in self._marks.items() if value < 0]) + communication_not_overlapped = len([_ for _, value in self._marks.items() if value > 0]) + self._result_data.overall_metrics.set_compute_time(compute_time) + self._result_data.overall_metrics.set_comm_not_overlap(communication_not_overlapped) + + def __add_marks(self, event: TraceEventBean): + if event.is_nccl_name(): + for timestep in range(int(event.start_time + 1), int(event.end_time + 1)): + self._marks[str(timestep)] += 1 # mark this timestep in communication stream + else: + for timestep in range(int(event.start_time + 1), int(event.end_time + 1)): + self._marks[str(timestep)] += -100 # mark this timestep in compute stream + + def __add_compute_time(self, event: TraceEventBean, aten_events: list, flow_dict_new: dict): + if self.__is_flash_attention(event.name): + if event.is_backward(): + self._result_data.overall_metrics.update_fa_bwd_info(event.dur) + else: + self._result_data.overall_metrics.update_fa_fwd_info(event.dur) + elif any(cube_mark in event.lower_name for cube_mark in self.CUBE_MARK): + is_conv = self.__check_is_conv(event, aten_events, flow_dict_new) + if is_conv == "conv_fwd": + self._result_data.overall_metrics.update_conv_fwd_info(event.dur) + elif is_conv == "conv_bwd": + self._result_data.overall_metrics.update_conv_bwd_info(event.dur) + else: + self._result_data.overall_metrics.update_cube_info(event.dur) + else: + self._result_data.overall_metrics.update_vec_info(event.dur) + + def __check_is_conv(self, event: TraceEventBean, aten_events: list, flow_dict_new: dict) -> str: + flow_start_time = 
flow_dict_new.get(event.start_time) + if not flow_start_time: + return "" + aten_len = len(aten_events) + while self._aten_index < aten_len: + cur_aten = aten_events[self._aten_index] + if cur_aten.end_time < flow_start_time: + self._aten_index += 1 + continue + if cur_aten.start_time < flow_start_time: + if cur_aten.is_conv(): + return "conv_bwd" if cur_aten.is_backward() else "conv_fwd" + return "" + + def _picking_memory_event(self, event: TraceEventBean): + if event.is_memory_event(): + self._memory_events.append(event) + return True + return False + + def _is_torch_op_event(self, event: TraceEventBean): + return event.lower_cat in self.TORCH_OP_CAT + + def _is_kernel_event(self, event: TraceEventBean): + return event.is_kernel_cat() + + def _is_flow_event(self, event: TraceEventBean): + return event.lower_cat in self._flow_cat + + def __parse_memory_reserved(self): + if not self._memory_events: + print("[INFO] Gpu profiling data doesn't contain memory info.") + return + memory_used = max([event.total_reserved for event in self._memory_events]) / 1024 ** 3 + self._result_data.overall_metrics.set_memory_used(memory_used) + + def _get_dispatch_func(self): + func_set = set() + if self._enable_memory_compare or self._enable_operator_compare: + func_set.add(self._picking_torch_op_event) + if self._enable_communication_compare: + func_set.add(self._picking_kernel_event) + if self._enable_operator_compare: + func_set.add(self._picking_python_function_event) + func_set.add(self._picking_fwdbwd_flow_event) + if self._enable_operator_compare or self._args.max_kernel_num: + func_set.add(self._picking_kernel_event) + func_set.add(self._picking_flow_event) + if self._enable_memory_compare or self._enable_profiling_compare: + func_set.add(self._picking_memory_event) + return list(func_set) + + def _infer_compute_stream_id(self): + if not self._enable_profiling_compare: + return -1 + kernel_stream_ids = [] + for event in self._trace_events: + if event.is_kernel_except_nccl() 
and event.stream: + kernel_stream_ids.append(event.stream) + if not kernel_stream_ids: + raise RuntimeError('[ERROR] The profiling data does not contain kernel running data.') + counter = Counter(kernel_stream_ids) + return counter.most_common(1)[0][0] diff --git a/profiler/compare_tools_review/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools_review/compare_backend/profiling_parser/npu_profiling_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..70ce44b44eb419196dc479dc30ae0b1e4a1136cb --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/profiling_parser/npu_profiling_parser.py @@ -0,0 +1,323 @@ +import os +import sys +from math import ceil + +from compare_backend.compare_bean.origin_data_bean.kernel_details_bean import KernelDetailsBean +from compare_backend.compare_bean.origin_data_bean.memory_record_bean import MemoryRecordBean +from compare_backend.compare_bean.origin_data_bean.operator_memory_bean import OperatorMemoryBean +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.profiling_parser.base_profiling_parser import BaseProfilingParser +from compare_backend.utils.constant import Constant +from compare_backend.utils.file_reader import FileReader + + +class NPUProfilingParser(BaseProfilingParser): + FLOW_CAT = "async_npu" + TORCH_OP_CAT = "cpu_op" + ACTIVE_CPU = "ProfilerActivity.CPU" + LEVEL_0 = "Level0" + + def __init__(self, args: any, path_dict: dict): + super().__init__(args, path_dict) + self._operator_memory_path = os.path.join(path_dict.get(Constant.ASCEND_OUTPUT_PATH, ""), "operator_memory.csv") + self._memory_record_path = os.path.join(path_dict.get(Constant.ASCEND_OUTPUT_PATH, ""), "memory_record.csv") + self._kernel_detail_path = os.path.join(path_dict.get(Constant.ASCEND_OUTPUT_PATH, ""), "kernel_details.csv") + self._info_json_path = path_dict.get(Constant.INFO_JSON_PATH, "") + self._trace_events = 
[TraceEventBean(event) for event in self._trace_events] + self._hccl_pid = None + self._hccl_op_tid_list = [] + self._kernel_pid = None + self._overlap_pid = None + self._enqueue_dict = {} + self._dequeue_data = [] + self._overlap_analysis = [] + self._dispatch_func = self._get_dispatch_func() + self._filter_meta_id() + + def _get_dispatch_func(self): + func_list = set() + if self._enable_memory_compare or self._enable_operator_compare: + func_list.add(self._picking_torch_op_event) + if self._enable_operator_compare or self._args.max_kernel_num: + func_list.add(self._picking_kernel_event) + func_list.add(self._picking_flow_event) + if self._enable_operator_compare: + func_list.add(self._picking_python_function_event) + func_list.add(self._picking_fwdbwd_flow_event) + if self._enable_memory_compare: + func_list.add(self._picking_task_queue_data) + if self._enable_communication_compare: + func_list.add(self._picking_hccl_event) + if self._enable_profiling_compare: + func_list.add(self._picking_overlap_analysis_data) + func_list.add(self._picking_kernel_event) + func_list.add(self._picking_hccl_event) + return list(func_list) + + def _update_memory_list(self): + try: + memory_data = FileReader.read_csv_file(self._operator_memory_path, OperatorMemoryBean) + except FileNotFoundError: + print("[WARNING] The file operator_memory.csv does not exist.") + return + except Exception: + print("[ERROR] Failed to read operator_memory.csv.") + return + if memory_data: + self._dequeue_data.sort(key=lambda x: x.start_time) + for data in memory_data: + if not data.allocation_time: + continue + if data.is_cann_op(): + matched_corr_id = self.__match_dequeue_data(data.allocation_time) + if matched_corr_id == Constant.INVALID_VALUE: + continue + self._result_data.update_memory_list({Constant.SIZE: data.size, + Constant.TS: self._enqueue_dict.get(matched_corr_id, 0), + Constant.NAME: data.name, + Constant.ALLOCATION_TIME: data.allocation_time, + Constant.RELEASE_TIME: data.release_time}) 
+ else: + self._result_data.update_memory_list({Constant.SIZE: data.size, + Constant.TS: data.allocation_time, + Constant.ALLOCATION_TIME: data.allocation_time, + Constant.RELEASE_TIME: data.release_time}) + + def __match_dequeue_data(self, ts_time: float) -> int: + if not self._dequeue_data: + return Constant.INVALID_VALUE + left, right = 0, len(self._dequeue_data) - 1 + while right > left: + mid = left + ceil((right - left) / 2) + if ts_time >= self._dequeue_data[mid].start_time: + left = mid + else: + right = mid - 1 + return self._dequeue_data[left].corr_id if self._dequeue_data[left].start_time <= ts_time <= \ + self._dequeue_data[left].end_time else Constant.INVALID_VALUE + + def _update_overall_metrics(self): + self.__parse_info_json() + self.__parse_mem_csv() + self.__parse_kernel_csv() + self.__add_lccl_time() + self.__add_sdma_time() + self.__add_overlap_analysis_time() + self._picking_notify_wait_event_and_not_overlap_event() + self.__add_overlap_wait_time() + self._result_data.overall_metrics.calculate_other_time() + self._result_data.overall_metrics.calculate_schedule_time() + self._result_data.overall_metrics.trans_time_to_s() + + def _picking_notify_wait_event_and_not_overlap_event(self): + self.notify_event_cache = [] + self._not_overlaped_commu_event = [] + for event in self._comm_task_list: + if event.name == 'Notify_Wait' and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_CHECK' \ + and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_ACK': + self.notify_event_cache.append(event) + for event in self._overlap_analysis: + if event.is_comm_not_overlap(): + self._not_overlaped_commu_event.append(event) + self._not_overlaped_commu_event.sort(key=lambda x: x.start_time) + + def __add_overlap_wait_time(self): + notify_wait_event_dict = dict() + for notify_event in self.notify_event_cache: + if notify_event.tid in notify_wait_event_dict: + notify_wait_event_dict[notify_event.tid].append(notify_event) + else: + notify_wait_event_dict[notify_event.tid] = 
[notify_event] + + if self._result_data.overall_metrics.is_level0: + return + + total_time = 0 + for commu_event in self._not_overlaped_commu_event: + wait_time_list = [0] + commu_event_start_time = float(commu_event.start_time) + commu_event_end_time = float(commu_event.start_time) + commu_event.dur + + for plane_id, events in notify_wait_event_dict.items(): + wait_time = 0 + idx = 0 + for notify_event in events: + notify_event_start_time = float(notify_event.start_time) + notify_event_end_time = float(notify_event.start_time) + notify_event.dur + if notify_event_start_time < commu_event_start_time and notify_event_end_time > \ + commu_event_end_time: + wait_time = commu_event_end_time - commu_event_start_time + break + elif notify_event_start_time < commu_event_start_time <= notify_event_end_time <= \ + commu_event_end_time: + wait_time += notify_event_end_time - commu_event_start_time + idx += 1 + elif commu_event_start_time <= notify_event_start_time <= commu_event_end_time < \ + notify_event_end_time: + wait_time += commu_event_end_time - notify_event_start_time + break + elif notify_event_start_time >= commu_event_start_time and notify_event_end_time <= \ + commu_event_end_time: + wait_time += notify_event_end_time - notify_event_start_time + idx += 1 + elif notify_event_end_time < commu_event_start_time: + idx += 1 + else: + break + + wait_time_list.append(wait_time) + notify_wait_event_dict[plane_id] = notify_wait_event_dict[plane_id][idx:] + total_time += max(wait_time_list) + self._result_data.overall_metrics.update_comm_not_overlap_wait_time(total_time) + + def _picking_hccl_event(self, event: TraceEventBean): + if event.pid != self._hccl_pid or not event.is_x_mode(): + return False + if event.tid in self._hccl_op_tid_list: + self._comm_list.append(event) + else: + self._comm_task_list.append(event) + return True + + def _picking_task_queue_data(self, event: TraceEventBean): + if event.is_enqueue(): + self._enqueue_dict[event.corr_id] = event.start_time 
+ return True + elif event.is_dequeue(): + self._dequeue_data.append(event) + return True + return False + + def _picking_overlap_analysis_data(self, event: TraceEventBean): + if event.pid == self._overlap_pid and event.is_x_mode(): + self._overlap_analysis.append(event) + return True + return False + + def _is_kernel_event(self, event: TraceEventBean): + return event.pid == self._kernel_pid and event.is_x_mode() + + def _is_flow_event(self, event: TraceEventBean): + return event.lower_cat == self.FLOW_CAT + + def _is_torch_op_event(self, event: TraceEventBean): + return event.lower_cat == self.TORCH_OP_CAT + + def _filter_meta_id(self): + for event in self._trace_events: + if not event.is_process_meta(): + continue + if event.is_hccl_process_name(): + self._hccl_pid = event.pid + elif event.is_npu_process_name(): + self._kernel_pid = event.pid + elif event.is_overlap_process_name(): + self._overlap_pid = event.pid + if not self._enable_communication_compare: + return + for event in self._trace_events: + if not event.is_thread_meta(): + continue + if event.pid == self._hccl_pid and event.is_communication_op_thread(): + self._hccl_op_tid_list.append(event.tid) + + def __parse_info_json(self): + try: + json_data = FileReader.read_trace_file(self._info_json_path) + except Exception: + print('[WARNING] Failed to read profiler_info.json.') + return + if not isinstance(json_data, dict) or not json_data: + print('[WARNING] Invalid profiler info.') + return + level = json_data.get('config', {}).get('experimental_config', {}).get('_profiler_level', '') + if self.LEVEL_0 != level: + return + self._result_data.overall_metrics.is_level0 = True + if self.ACTIVE_CPU in json_data.get('config', {}).get('common_config', {}).get('activities', []): + return + self._result_data.overall_metrics.minimal_profiling = True + + def __add_lccl_time(self): + for event in self._all_kernels.values(): + if event.is_lccl(): + self._result_data.overall_metrics.update_lccl_info(event.dur) + + def 
__parse_kernel_csv(self): + try: + kernel_details = FileReader.read_csv_file(self._kernel_detail_path, KernelDetailsBean) + except Exception: + print('[WARNING] Npu kernel details csv file is not available.') + return + if not kernel_details or kernel_details[0].is_hide_op_pmu(): + self._result_data.overall_metrics.hide_op_details = True + return + for kernel in kernel_details: + if kernel.is_invalid(): + continue + if kernel.is_flash_attention(): + if kernel.is_fa_bwd(): + self._result_data.overall_metrics.update_fa_bwd_info(kernel.duration) + else: + self._result_data.overall_metrics.update_fa_fwd_info(kernel.duration) + elif kernel.is_conv(): + if kernel.is_conv_bwd(): + self._result_data.overall_metrics.update_conv_bwd_info(kernel.duration) + else: + self._result_data.overall_metrics.update_conv_fwd_info(kernel.duration) + elif kernel.is_cube(): + self._result_data.overall_metrics.update_cube_info(kernel.duration) + elif kernel.is_sdma(): + self._result_data.overall_metrics.update_sdma_info(kernel.duration) + elif kernel.is_page_attention(): + self._result_data.overall_metrics.update_pa_info(kernel.duration) + elif kernel.is_vector(): + self._result_data.overall_metrics.update_vec_info(kernel.duration) + else: + self._result_data.overall_metrics.update_cube_info(kernel.duration) + + def __parse_mem_csv(self): + try: + memory_record = FileReader.read_csv_file(self._memory_record_path, MemoryRecordBean) + except FileNotFoundError: + print('[INFO] Npu memory record csv file is not available.') + except Exception: + print('[WARNING] Load memory info failed.') + else: + memory_used = max([memory.total_reserved_mb for memory in memory_record]) / 1024 + self._result_data.overall_metrics.set_memory_used(memory_used) + + def __add_overlap_analysis_time(self): + if not self._overlap_analysis: + print('[ERROR] Failed to get overlap analysis data.') + return + min_ts = sys.float_info.max + max_ts = sys.float_info.min + for event in self._overlap_analysis: + if 
event.is_computing_event(): + self._result_data.overall_metrics.update_compute_time(event.dur) + min_ts = min(event.start_time, min_ts) + max_ts = max(event.end_time, max_ts) + elif event.is_comm_not_overlap(): + self._result_data.overall_metrics.update_comm_not_overlap(event.dur) + min_ts = min(event.start_time, min_ts) + max_ts = max(event.end_time, max_ts) + self._result_data.overall_metrics.set_e2e_time(float(max_ts - min_ts)) + + def __add_sdma_time(self) -> (float, int): + event_wait_stream, ai_core_stream = set(), set() + sdma_dict = {} + for event in self._all_kernels.values(): + stream_id = event.stream_id + if not stream_id: + continue + if event.is_event_wait(): + event_wait_stream.add(stream_id) + elif event.is_sdma_event(): + sdma_dict.setdefault(stream_id, []).append(event.dur) + elif event.is_compute_event(): + ai_core_stream.add(stream_id) + compute_stream = event_wait_stream & ai_core_stream if event_wait_stream else ai_core_stream + for stream in compute_stream: + dur_list = sdma_dict.get(stream, []) + self._result_data.overall_metrics.update_sdma_info(sum(dur_list), len(dur_list)) diff --git a/profiler/compare_tools_review/compare_backend/utils/__init__.py b/profiler/compare_tools_review/compare_backend/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/compare_tools_review/compare_backend/utils/args_manager.py b/profiler/compare_tools_review/compare_backend/utils/args_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..4b5947fa7bccc32277bb9d18d97ab71249c66941 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/utils/args_manager.py @@ -0,0 +1,136 @@ +import os.path +import re + +from common_func.path_manager import PathManager +from compare_backend.utils.constant import Constant +from compare_backend.utils.file_reader import FileReader + + +class Singleton(object): + def __init__(self, cls): + self._cls = 
@Singleton
class ArgsManager:
    """Singleton that validates the command-line arguments and resolves the
    base/comparison profiling paths into dictionaries describing the
    profiling type and the relevant data files.
    """

    def __init__(self):
        self._args = None  # raw argparse-style namespace, populated by init()
        self._base_path_dict = {}
        self._comparison_path_dict = {}

    @property
    def args(self):
        return self._args

    @property
    def base_profiling_type(self):
        return self._base_path_dict.get(Constant.PROFILING_TYPE)

    @property
    def comparison_profiling_type(self):
        return self._comparison_path_dict.get(Constant.PROFILING_TYPE)

    @property
    def base_profiling_path(self):
        return self._args.base_profiling_path

    @property
    def comparison_profiling_path(self):
        # Bug fix: previously returned self._args.comparison_profiling_path_dict,
        # an attribute that does not exist on the args object, so reading this
        # property raised AttributeError.
        return self._args.comparison_profiling_path

    @property
    def base_path_dict(self):
        return self._base_path_dict

    @property
    def comparison_path_dict(self):
        return self._comparison_path_dict

    @property
    def enable_profiling_compare(self):
        return self._args.enable_profiling_compare

    @property
    def enable_operator_compare(self):
        return self._args.enable_operator_compare

    @property
    def enable_memory_compare(self):
        return self._args.enable_memory_compare

    @property
    def enable_communication_compare(self):
        return self._args.enable_communication_compare

    @classmethod
    def check_profiling_path(cls, file_path: str):
        """Validate that a user-supplied profiling path is safe to read."""
        PathManager.input_path_common_check(file_path)
        PathManager.check_path_owner_consistent(file_path)

    @classmethod
    def check_output_path(cls, output_path: str):
        """Validate the output directory, creating it if necessary."""
        PathManager.check_input_directory_path(output_path)
        PathManager.make_dir_safety(output_path)
        PathManager.check_path_writeable(output_path)

    def parse_profiling_path(self, file_path: str):
        """Classify *file_path* and return a dict describing the profiling data.

        A ``.json`` file is treated as an exported trace (GPU or NPU, decided
        by its content).  A directory must contain a ``trace_view.json``
        produced by the PyTorch adaptor, optionally under
        ``ASCEND_PROFILER_OUTPUT``.

        Raises:
            RuntimeError: for any other file suffix or directory layout.
        """
        self.check_profiling_path(file_path)
        if os.path.isfile(file_path):
            extension = os.path.splitext(file_path)[1]
            if extension != ".json":
                msg = f"Invalid profiling path suffix: {file_path}"
                raise RuntimeError(msg)
            json_type = FileReader.check_json_type(file_path)
            return {Constant.PROFILING_TYPE: json_type, Constant.PROFILING_PATH: file_path,
                    Constant.TRACE_PATH: file_path}
        ascend_output = os.path.join(file_path, "ASCEND_PROFILER_OUTPUT")
        profiler_output = ascend_output if os.path.isdir(ascend_output) else file_path
        json_path = os.path.join(profiler_output, "trace_view.json")
        if not os.path.isfile(json_path):
            msg = (f"The data is not collected by PyTorch Adaptor mode or the data is not parsed. "
                   f"Invalid profiling path: {profiler_output}")
            raise RuntimeError(msg)
        path_dict = {Constant.PROFILING_TYPE: Constant.NPU, Constant.PROFILING_PATH: file_path,
                     Constant.TRACE_PATH: json_path, Constant.ASCEND_OUTPUT_PATH: profiler_output}
        for dir_name in os.listdir(file_path):
            # Pick up the (rank-suffixed) profiler_info json if present.
            if dir_name == "profiler_info.json" or re.match(r"profiler_info_[0-9]+\.json", dir_name):
                path_dict.update({Constant.INFO_JSON_PATH: os.path.join(file_path, dir_name)})
        return path_dict

    def init(self, args: any):
        """Validate *args* and resolve both profiling paths.

        Raises:
            RuntimeError: when any argument is out of range or malformed.
        """
        self._args = args
        if self._args.max_kernel_num is not None and self._args.max_kernel_num <= Constant.LIMIT_KERNEL:
            msg = f"Invalid param, --max_kernel_num has to be greater than {Constant.LIMIT_KERNEL}"
            raise RuntimeError(msg)
        if not isinstance(self._args.op_name_map, dict):
            raise RuntimeError(
                "Invalid param, --op_name_map must be dict, for example: --op_name_map={'name1':'name2'}")
        if self._args.gpu_flow_cat and len(self._args.gpu_flow_cat) > Constant.MAX_FLOW_CAT_LEN:
            msg = f"Invalid param, --gpu_flow_cat exceeded the maximum value {Constant.MAX_FLOW_CAT_LEN}"
            raise RuntimeError(msg)

        # When no specific comparison is requested, enable all of them.
        if not any([self._args.enable_profiling_compare, self._args.enable_operator_compare,
                    self._args.enable_memory_compare, self._args.enable_communication_compare]):
            self._args.enable_profiling_compare = True
            self._args.enable_operator_compare = True
            self._args.enable_memory_compare = True
            self._args.enable_communication_compare = True

        base_profiling_path = PathManager.get_realpath(self._args.base_profiling_path)
        self.check_profiling_path(base_profiling_path)
        self._base_path_dict = self.parse_profiling_path(base_profiling_path)
        comparison_profiling_path = PathManager.get_realpath(self._args.comparison_profiling_path)
        self.check_profiling_path(comparison_profiling_path)
        self._comparison_path_dict = self.parse_profiling_path(comparison_profiling_path)

        if self._args.output_path:
            self.check_output_path(PathManager.get_realpath(self._args.output_path))
def calculate_diff_ratio(base_value: float, comparison_value: float):
    """Return ``[comparison - base, comparison / base]``.

    When both values are zero/falsy the ratio is defined as 1.0 (no change);
    when only the base value is zero the ratio is +inf.
    """
    if not base_value and not comparison_value:
        ratio = 1.0
    else:
        ratio = float('inf') if not base_value else comparison_value / base_value
    return [comparison_value - base_value, ratio]


def update_order_id(data_list: list):
    """Rewrite the first column of every non-empty row with a 1-based index."""
    for index, data in enumerate(data_list):
        if data:
            data[0] = index + 1


def convert_to_float(data: any) -> float:
    """Convert *data* to float, returning 0.0 (and logging) on failure."""
    try:
        float_value = float(data)
    except Exception:
        print('[ERROR] Invalid profiling data which failed to convert data to float.')
        return 0.0
    return float_value


def convert_to_decimal(data: any) -> Decimal:
    """Convert *data* to Decimal, returning Decimal(0) (and logging) on failure."""
    try:
        decimal_value = Decimal(data)
    except Exception:
        print('[ERROR] Invalid profiling data which failed to convert data to decimal.')
        # Bug fix: used to return the float 0.0, violating the declared Decimal
        # return type and breaking downstream Decimal arithmetic.
        return Decimal(0)
    return decimal_value


def longest_common_subsequence_matching(base_ops: list, comparison_ops: list, name_func: any) -> list:
    """Align *base_ops* and *comparison_ops* with a longest-common-subsequence match.

    Returns a list of ``[base, comparison]`` pairs in base order; the side with
    no counterpart is ``None``.  ``name_func`` extracts the comparison key
    from an op.
    """
    if not comparison_ops:
        return [[value, None] for value in base_ops]

    comparison_len, base_len = len(comparison_ops), len(base_ops)
    # dp_flag[i][j] == 1 records that dropping the j-th base op was worse than
    # dropping the i-th comparison op; it drives the backtracking below.
    dp_flag = numpy.zeros(shape=(comparison_len + 1, base_len + 1), dtype=int)
    pre_list = [0] * (base_len + 1)   # DP row for the previous comparison op
    cur_list = [0] * (base_len + 1)   # DP row currently being filled

    for comparison_index, comparison_data in enumerate(comparison_ops, 1):
        for base_index, base_data in enumerate(base_ops, 1):
            if name_func(comparison_data) == name_func(base_data):
                cur_list[base_index] = pre_list[base_index - 1] + 1
            else:
                only_base = cur_list[base_index - 1]
                only_comparison = pre_list[base_index]
                if only_base < only_comparison:
                    dp_flag[comparison_index][base_index] = 1  # 1 for only comparison op
                    cur_list[base_index] = only_comparison
                else:
                    cur_list[base_index] = only_base
        # Bug fix: the original did ``pre_list = cur_list`` without allocating a
        # fresh row, so both names aliased the same list from the second row on
        # and the DP mixed current-row and previous-row values.
        pre_list, cur_list = cur_list, [0] * (base_len + 1)

    matched_op = []
    comparison_index, base_index = comparison_len, base_len
    while comparison_index > 0 and base_index > 0:
        base_data = base_ops[base_index - 1]
        comparison_data = comparison_ops[comparison_index - 1]
        if name_func(base_data) == name_func(comparison_data):
            matched_op.append([base_data, comparison_data])
            comparison_index -= 1
            base_index -= 1
        elif dp_flag[comparison_index][base_index] == 1:  # 1 for only comparison op
            matched_op.append([None, comparison_data])
            comparison_index -= 1
        else:
            matched_op.append([base_data, None])
            base_index -= 1
    while comparison_index > 0:
        matched_op.append([None, comparison_ops[comparison_index - 1]])
        comparison_index -= 1
    while base_index > 0:
        matched_op.append([base_ops[base_index - 1], None])
        base_index -= 1
    matched_op.reverse()
    return matched_op
index 0000000000000000000000000000000000000000..ab9bc364f440ca8412a6e40d67ca74b7c897cbd9 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/utils/compare_args.py @@ -0,0 +1,24 @@ +class Args: + def __init__(self, + base_profiling_path: str = "", + comparison_profiling_path: str = "", + enable_profiling_compare: bool = False, + enable_operator_compare: bool = False, + enable_memory_compare: bool = False, + enable_communication_compare: bool = False, + output_path: str = "", + max_kernel_num: int = None, + op_name_map: dict = {}, + use_input_shape: bool = False, + gpu_flow_cat: str = ""): + self.base_profiling_path = base_profiling_path + self.comparison_profiling_path = comparison_profiling_path + self.enable_profiling_compare = enable_profiling_compare + self.enable_operator_compare = enable_operator_compare + self.enable_memory_compare = enable_memory_compare + self.enable_communication_compare = enable_communication_compare + self.output_path = output_path + self.max_kernel_num = max_kernel_num + self.op_name_map = op_name_map + self.use_input_shape = use_input_shape + self.gpu_flow_cat = gpu_flow_cat diff --git a/profiler/compare_tools_review/compare_backend/utils/constant.py b/profiler/compare_tools_review/compare_backend/utils/constant.py new file mode 100644 index 0000000000000000000000000000000000000000..1b77b214c85f6733e36298e119e43a778fd7969f --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/utils/constant.py @@ -0,0 +1,80 @@ +class Constant(object): + GPU = "GPU" + NPU = "NPU" + NA = 'N/A' + LIMIT_KERNEL = 3 + MAX_PATH_LENGTH = 4096 + MAX_FLOW_CAT_LEN = 20 + MAX_FILE_SIZE = 1024 * 1024 * 1024 * 5 + BYTE_TO_KB = 1024 + YELLOW_COLOR = "FFFF00" + GREEN_COLOR = "00FF00" + RED_COLOR = "FF0000" + BLUE_COLOR = "00BFFF" + US_TO_MS = 1000 + KB_TO_MB = 1024 + INVALID_VALUE = -1 + + # epsilon + EPS = 1e-15 + + # autority + FILE_AUTHORITY = 0o640 + DIR_AUTHORITY = 0o750 + + PROFILING_TYPE = "profiling type" + + # path + PROFILING_PATH = 
"profiling_path" + TRACE_PATH = "trace_path" + MEMORY_DATA_PATH = "memory_data_path" + ASCEND_OUTPUT_PATH = "ascend_output" + INFO_JSON_PATH = "info_path" + + # excel headers + BASE_PROFILING = 'Base Profiling: ' + COMPARISON_PROFILING = 'Comparison Profiling: ' + + # compare type + OPERATOR_COMPARE = "OperatorCompare" + MEMORY_COMPARE = "MemoryCompare" + + # sheet name + OPERATOR_SHEET = "OperatorCompare" + MEMORY_SHEET = "MemoryCompare" + OPERATOR_TOP_SHEET = "OperatorCompareStatistic" + MEMORY_TOP_SHEET = "MemoryCompareStatistic" + COMMUNICATION_SHEET = "CommunicationCompare" + + # table name + OPERATOR_TABLE = "OperatorCompare" + MEMORY_TABLE = "MemoryCompare" + OPERATOR_TOP_TABLE = "OperatorCompareStatistic" + MEMORY_TOP_TABLE = "MemoryCompareStatistic" + COMMUNICATION_TABLE = "CommunicationCompare" + PERFORMANCE_TABLE = "Model Profiling Time Distribution" + MODULE_TABLE = "ModuleCompare" + MODULE_TOP_TABLE = "ModuleCompareStatistic" + + # memory + SIZE = "Size(KB)" + TS = "ts" + ALLOCATION_TIME = "Allocation Time(us)" + RELEASE_TIME = "Release Time(us)" + NAME = "Name" + + OP_KEY = "op_name" + DEVICE_DUR = "dur" + + BASE_DATA = "base_data" + COMPARISON_DATA = "comparison_data" + OVERALL_METRICS = "overall_metrics" + TORCH_OP = "torch_op" + KERNEL_DICT = "kernel_dict" + MEMORY_LIST = "memory_list" + COMMUNICATION_DICT = "comm_dict" + + #compare type + OVERALL_COMPARE = "overall" + + BWD_LIST = ["bwd", "backward", "back"] diff --git a/profiler/compare_tools_review/compare_backend/utils/excel_config.py b/profiler/compare_tools_review/compare_backend/utils/excel_config.py new file mode 100644 index 0000000000000000000000000000000000000000..306abcdfec6e62f24977b989258ad190a90c9bd7 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/utils/excel_config.py @@ -0,0 +1,185 @@ +from compare_backend.utils.constant import Constant + + +class CellFormatType: + DEFAULT = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', 'border': 
class CellFormatType:
    """xlsxwriter cell-format dictionaries shared by every sheet."""
    DEFAULT = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', 'border': True,
               'num_format': '#,##0'}  # integer number format, no background color
    DEFAULT_FLOAT = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', 'border': True,
                     'num_format': '#,##0.00'}  # two decimal places, no background color
    DEFAULT_RATIO = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter',
                     'border': True, 'num_format': '0.00%'}  # percentage, two decimal places, no background color
    RED_RATIO = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter',
                 'border': True, 'num_format': '0.00%', "fg_color": Constant.RED_COLOR}  # percentage, two decimals, red background
    BOLD_STR = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', 'border': True,
                'bold': True}  # string, no background color, bold font
    BLUE_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.BLUE_COLOR, 'align': 'left',
                 'valign': 'vcenter', 'bold': True, 'border': True}  # blue background, bold
    GREEN_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.GREEN_COLOR, 'align': 'left',
                  'valign': 'vcenter', 'bold': True, 'border': True}  # green background, bold
    YELLOW_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.YELLOW_COLOR, 'align': 'left',
                   'valign': 'vcenter', 'bold': True, 'border': True}  # yellow background, bold


class ExcelConfig(object):
    """Column layouts for every sheet of the comparison workbook.

    Comparison tables list the base-side columns first and then repeat the
    same columns for the comparison side, followed by the diff columns; the
    duplicated header names are therefore intentional.
    """
    ORDER = "Order Id"
    OPERATOR_NAME = "Operator Name"
    INPUT_SHAPE = "Input Shape"
    INPUT_TYPE = "Input Type"
    KERNEL_DETAILS = "Kernel Details"
    MEMORY_DETAILS = "Allocated Details"
    DEVICE_DURATION = "Device Duration(us)"
    DIFF_RATIO = "Diff Ratio"
    DIFF_DUR = "Diff Duration(us)"
    DIFF_SIZE = "Diff Size(KB)"
    SIZE = "Size(KB)"
    TOP = "Top"
    BASE_DEVICE_DURATION = "Base Device Duration(ms)"
    COMPARISON_DEVICE_DURATION = "Comparison Device Duration(ms)"
    BASE_OPERATOR_NUMBER = "Base Operator Number"
    COMPARISON_OPERATOR_NUMBER = "Comparison Operator Number"
    DIFF_TIME = "Diff Duration(ms)"
    BASE_ALLOCATED_TIMES = "Base Allocated Duration(ms)"
    COMPARISON_ALLOCATED_TIMES = "Comparison Allocated Duration(ms)"
    BASE_ALLOCATED_MEMORY = "Base Allocated Memory(MB)"
    COMPARISON_ALLOCATED_MEMORY = "Comparison Allocated Memory(MB)"
    DIFF_MEMORY = "Diff Memory(MB)"
    COMM_OP_NAME = "Communication OP Name"
    TASK_NAME = "Task Name"
    CALLS = "Calls"
    TOTAL_DURATION = "Total Duration(us)"
    AVG_DURATION = "Avg Duration(us)"
    MAX_DURATION = "Max Duration(us)"
    MIN_DURATION = "Min Duration(us)"
    MODULE_CLASS = "Module Class"
    MODULE_NAME = "Module Name"
    DEVICE_SELF_TIME = "Device Self Time(ms)"
    DEVICE_TOTAL_TIME = "Device Total Time(ms)"
    DIFF_SELF_TIME = "Device Self Time Diff(ms)"
    DIFF_TOTAL_RATIO = "Total Diff Ratio"
    DIFF_TOTAL_TIME = "Device Total Time Diff(ms)"
    DEVICE_SELF_TIME_US = "Device Self Time(us)"
    DEVICE_TOTAL_TIME_US = "Device Total Time(us)"
    DIFF_SELF_TIME_US = "Device Self Time Diff(us)"
    DIFF_TOTAL_TIME_US = "Device Total Time Diff(us)"
    NUMBER = "Number"
    MODULE_LEVEL = "Module Level"
    BASE_CALL_STACK = "Base Call Stack"
    COMPARISON_CALL_STACK = "Comparison Call Stack"

    # Per-table column definitions: header text, cell format, column width.
    HEADERS = {
        Constant.OPERATOR_TABLE: [
            {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30},
            {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": INPUT_TYPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": DEVICE_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30},
            {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": INPUT_TYPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": DEVICE_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_DUR, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}
        ],
        Constant.MEMORY_TABLE: [
            {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30},
            {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": INPUT_TYPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": MEMORY_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": SIZE, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30},
            {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": INPUT_TYPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": MEMORY_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": SIZE, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_SIZE, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}
        ],
        Constant.OPERATOR_TOP_TABLE: [
            {"name": TOP, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30},
            {"name": BASE_DEVICE_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 25},
            {"name": BASE_OPERATOR_NUMBER, "type": CellFormatType.DEFAULT, "width": 25},
            {"name": COMPARISON_DEVICE_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 30},
            {"name": COMPARISON_OPERATOR_NUMBER, "type": CellFormatType.DEFAULT, "width": 30},
            {"name": DIFF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}
        ],
        Constant.MEMORY_TOP_TABLE: [
            {"name": TOP, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30},
            {"name": BASE_ALLOCATED_TIMES, "type": CellFormatType.DEFAULT_FLOAT, "width": 25},
            {"name": BASE_ALLOCATED_MEMORY, "type": CellFormatType.DEFAULT_FLOAT, "width": 30},
            {"name": BASE_OPERATOR_NUMBER, "type": CellFormatType.DEFAULT, "width": 25},
            {"name": COMPARISON_ALLOCATED_TIMES, "type": CellFormatType.DEFAULT_FLOAT, "width": 27},
            {"name": COMPARISON_ALLOCATED_MEMORY, "type": CellFormatType.DEFAULT_FLOAT, "width": 33},
            {"name": COMPARISON_OPERATOR_NUMBER, "type": CellFormatType.DEFAULT, "width": 25},
            {"name": DIFF_MEMORY, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}
        ],
        Constant.COMMUNICATION_TABLE: [
            {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": COMM_OP_NAME, "type": CellFormatType.BOLD_STR, "width": 25},
            {"name": TASK_NAME, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": CALLS, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": TOTAL_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": AVG_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": MAX_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": MIN_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": COMM_OP_NAME, "type": CellFormatType.BOLD_STR, "width": 25},
            {"name": TASK_NAME, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": CALLS, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": TOTAL_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": AVG_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": MAX_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": MIN_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": DIFF_DUR, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}
        ],
        Constant.MODULE_TOP_TABLE: [
            {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": MODULE_CLASS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": MODULE_LEVEL, "type": CellFormatType.DEFAULT, "width": 15},
            {"name": MODULE_NAME, "type": CellFormatType.DEFAULT, "width": 35},
            {"name": OPERATOR_NAME, "type": CellFormatType.DEFAULT, "width": 25},
            {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": DEVICE_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": DEVICE_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": DEVICE_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": DEVICE_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 15},
            {"name": BASE_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30},
            {"name": COMPARISON_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30}
        ],
        Constant.MODULE_TABLE: [
            {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": MODULE_CLASS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": MODULE_LEVEL, "type": CellFormatType.DEFAULT, "width": 15},
            {"name": MODULE_NAME, "type": CellFormatType.DEFAULT, "width": 35},
            {"name": OPERATOR_NAME, "type": CellFormatType.DEFAULT, "width": 25},
            {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": DEVICE_SELF_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DEVICE_TOTAL_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": OPERATOR_NAME, "type": CellFormatType.DEFAULT, "width": 25},
            {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": DEVICE_SELF_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DEVICE_TOTAL_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_TOTAL_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_SELF_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 15},
            {"name": BASE_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30},
            {"name": COMPARISON_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30}
        ]
    }

    # Cell ranges to merge for the "Base"/"Comparison" super-headers of each table.
    OVERHEAD = {Constant.OPERATOR_TABLE: ["B1:F1", "G1:K1"], Constant.MEMORY_TABLE: ["B1:F1", "G1:K1"],
                Constant.COMMUNICATION_TABLE: ["B1:H1", "I1:O1"], Constant.OPERATOR_TOP_TABLE: ["C1:D1", "E1:F1"],
                Constant.MEMORY_TOP_TABLE: ["C1:E1", "F1:H1"], Constant.MODULE_TOP_TABLE: ["F1:I1", "J1:M1"],
                Constant.MODULE_TABLE: ["E1:H1", "I1:L1"]}
class FileReader:
    """Guarded loaders for trace (json) and csv profiling files."""

    @classmethod
    def _user_declined_large_file(cls, file_path: str) -> bool:
        """Prompt the user about an oversized file; True means skip reading it."""
        check_msg = input(
            f"The file({file_path}) size exceeds the preset max value. Continue reading the file? [y/n]")
        if check_msg.lower() != "y":
            print(f"[WARNING] The user choose not to read the file: {file_path}")
            return True
        return False

    @classmethod
    def read_trace_file(cls, file_path: str) -> any:
        """Load a json trace file, returning [] for empty or user-skipped files."""
        PathManager.check_path_readable(file_path)
        if not os.path.isfile(file_path):
            raise FileNotFoundError("File not exists.")
        file_size = os.path.getsize(file_path)
        if file_size <= 0:
            return []
        if file_size > Constant.MAX_FILE_SIZE and cls._user_declined_large_file(file_path):
            return []
        try:
            with open(file_path, "rt") as file:
                return json.loads(file.read())
        except Exception as e:
            msg = f"Can't read file: {file_path}"
            raise RuntimeError(msg) from e

    @classmethod
    def read_csv_file(cls, file_path: str, bean_class: any = None) -> any:
        """Load a csv file, wrapping each row in *bean_class* when given."""
        PathManager.check_path_readable(file_path)
        if not os.path.isfile(file_path):
            raise FileNotFoundError("File not exists.")
        file_size = os.path.getsize(file_path)
        if file_size <= 0:
            return []
        if file_size > Constant.MAX_FILE_SIZE and cls._user_declined_large_file(file_path):
            return []
        result_data = []
        try:
            with open(file_path, newline="") as csv_file:
                for row in csv.DictReader(csv_file):
                    result_data.append(bean_class(row) if bean_class else row)
        except Exception as e:
            msg = f"Failed to read the file: {file_path}"
            raise RuntimeError(msg) from e
        return result_data

    @classmethod
    def check_json_type(cls, file_path: str) -> str:
        """A dict-shaped trace is a GPU export; anything else is treated as NPU."""
        return Constant.GPU if isinstance(cls.read_trace_file(file_path), dict) else Constant.NPU
= "ts" + kernels = "kernels" + + def __init__(self, event: TraceEventBean, parent_node=None): + self._event = event + self._parent_node = parent_node + self._child_nodes = [] + self._module_name = f"{parent_node.module_name}/{event.name}" if parent_node else event.name + self._module_level = parent_node.module_level + 1 if parent_node else 1 + self._kernel_self_list = [] + self._kernel_total_list = [] + self._call_stack = f"{parent_node.call_stack};\n{event.name}" if parent_node and parent_node.call_stack \ + else event.name + self._root_torch_op_node = TorchOpNode() + self._cur_torch_op_node = self._root_torch_op_node + + @property + def module_name(self): + return self._module_name + + @property + def module_class(self): + pattern = re.compile('_[0-9]+$') + return pattern.sub('', self.name.split("/")[-1]) + + @property + def module_level(self): + return self._module_level + + @property + def name(self): + return self._event.name + + @property + def parent_node(self): + return self._parent_node + + @property + def child_nodes(self): + return self._child_nodes + + @property + def dur(self): + return self._event.dur + + @property + def start_time(self): + return self._event.start_time + + @property + def end_time(self): + return self._event.end_time + + @property + def host_self_dur(self): + return self.dur - sum([node.dur for node in self.child_nodes]) + + @property + def device_self_dur(self): + dur = 0 + for kernel_dict in self._kernel_self_list: + kernel_list = kernel_dict.get(self.kernels, []) + dur += sum([kernel.device_dur for kernel in kernel_list]) + return dur + + @property + def device_total_dur(self): + dur = 0 + for kernel_dict in self._kernel_total_list: + kernel_list = kernel_dict.get(self.kernels, []) + dur += sum([kernel.device_dur for kernel in kernel_list]) + return dur + + @property + def kernel_details(self): + kernel_details = "" + for kernel_dict in self._kernel_self_list: + kernel_list = kernel_dict.get(self.kernels, []) + for kernel in 
kernel_list: + kernel_details += kernel.kernel_details + return kernel_details + + @property + def toy_layer_api_list(self): + return self._root_torch_op_node.child_nodes + + @property + def call_stack(self): + return self._call_stack + + @staticmethod + def _binary_search(ts_time, parent_node): + if not parent_node.child_nodes: + return None + right = len(parent_node.child_nodes) - 1 + left = 0 + while right > left: + mid = left + ceil((right - left) / 2) + if ts_time >= parent_node.child_nodes[mid].start_time: + left = mid + else: + right = mid - 1 + if parent_node.child_nodes[left].start_time < ts_time < parent_node.child_nodes[left].end_time: + return parent_node.child_nodes[left] + return None + + def reset_call_stack(self, call_stack): + self._call_stack = call_stack + + def update_child_nodes(self, node): + self._child_nodes.append(node) + + def update_kernel_list(self, ts, kernel_list: list): + self._update_kernel_self_list(ts, kernel_list) + node = self + while node.parent_node: + node._update_kernel_total_list(ts, kernel_list) + node = node.parent_node + + def _update_kernel_self_list(self, ts, kernel_list: list): + self._kernel_self_list.append({self.ts: ts, self.kernels: kernel_list}) + + def _update_kernel_total_list(self, ts, kernel_list: list): + self._kernel_total_list.append({self.ts: ts, self.kernels: kernel_list}) + + def find_module_call(self, ts_time): + call_module = self._binary_search(ts_time, self) + while call_module: + module = self._binary_search(ts_time, call_module) + if not module: + return call_module + call_module = module + return call_module + + def find_torch_op_call(self, event): + while self._cur_torch_op_node: + if self._cur_torch_op_node != self._root_torch_op_node and \ + event.start_time > self._cur_torch_op_node.end_time: + self._cur_torch_op_node = self._cur_torch_op_node.parent + continue + tree_node = TorchOpNode(event, self._cur_torch_op_node) + self._cur_torch_op_node.add_child_node(tree_node) + 
class NameFunction:
    """Factory for the key function used to match operators between runs.

    The selected function depends on two flags of *args*: ``op_name_map``
    (rename operators before matching) and ``use_input_shape`` (include input
    shapes in the matching key).
    """

    def __init__(self, args: any):
        self.args = args

    @classmethod
    def get_name(cls, op_node: 'TorchOpNode') -> str:
        """Key: bare operator name."""
        return op_node.name

    @classmethod
    def get_full_name(cls, op_node: 'TorchOpNode') -> str:
        """Key: operator name plus formatted input shape."""
        if isinstance(op_node.origin_input_shape, list):
            data = []
            for dim in op_node.origin_input_shape:
                data.append(','.join([str(x) for x in dim]))
            input_shape = ';\r\n'.join(data)
            return f'{op_node.name}{input_shape}'
        return f'{op_node.name}{op_node.input_shape}'

    def get_name_func(self):
        """Select the matching key function from the args flags."""
        if not self.args.op_name_map and not self.args.use_input_shape:
            name_func = self.get_name
        elif self.args.op_name_map and not self.args.use_input_shape:
            name_func = self.get_map_name
        # Bug fix: this branch used to repeat the previous condition verbatim,
        # making it unreachable, so get_full_name was never selected.
        elif not self.args.op_name_map and self.args.use_input_shape:
            name_func = self.get_full_name
        else:
            name_func = self.get_full_map_name
        return name_func

    def get_map_name(self, op_node: 'TorchOpNode') -> str:
        """Key: operator name after applying the op_name_map renaming."""
        return self.args.op_name_map.get(op_node.name, op_node.name)

    def get_full_map_name(self, op_node: 'TorchOpNode') -> str:
        """Key: renamed operator name plus formatted input shape."""
        if isinstance(op_node.origin_input_shape, list):
            data = []
            for dim in op_node.origin_input_shape:
                data.append(','.join([str(x) for x in dim]))
            input_shape = ';\r\n'.join(data)
            return f'{self.args.op_name_map.get(op_node.name, op_node.name)}{input_shape}'
        return f'{self.args.op_name_map.get(op_node.name, op_node.name)}{op_node.input_shape}'

    def get_module_name(self, module: 'ModuleNode') -> str:
        """Module name with every op_name_map substitution applied."""
        if not self.args.op_name_map:
            return module.module_name
        module = module.module_name
        for old_name, new_name in self.args.op_name_map.items():
            # Bug fix: str.replace returns a new string; the original discarded
            # the result, so no substitution was ever applied.
            module = module.replace(old_name, new_name)
        return module
def set_kernel_list(self, kernel_list: list):
    """Attach *kernel_list* to this node and bump kernel counters upward.

    The counter is incremented on this node and on every ancestor that
    itself has a parent — i.e. the parentless root sentinel is never
    counted. Empty lists are ignored.
    """
    if kernel_list:
        self._kernel_list.extend(kernel_list)
        added = len(kernel_list)
        node = self
        while node._parent_node is not None:
            node._kernel_num += added
            node = node._parent_node
@classmethod
def get_total_kernels(cls, root_node: TorchOpNode) -> list:
    """Collect the kernel_list of *root_node* and of every descendant.

    Traversal is breadth-first, so kernels appear in level order —
    identical to the original Queue-based walk.
    """
    kernels = list(root_node.kernel_list)
    pending = list(root_node.child_nodes)
    cursor = 0
    # A cursor over a growing list behaves as a FIFO queue (BFS).
    while cursor < len(pending):
        current = pending[cursor]
        cursor += 1
        kernels.extend(current.kernel_list)
        pending.extend(current.child_nodes)
    return kernels
class BaseView(ABC):
    """Abstract base class for comparison-result renderers.

    ``data_dict`` maps a sheet name to its table payload; concrete
    subclasses decide the output medium (screen table, Excel sheet, ...).
    """

    def __init__(self, data_dict: dict):
        # Sheet name -> table data consumed by generate_view().
        self._data_dict = data_dict

    @abstractmethod
    def generate_view(self):
        """Render ``self._data_dict``; must be implemented by subclasses."""
        raise NotImplementedError("Function generate_view need to be implemented.")
class ScreenView(BaseView):
    """Render each sheet of the comparison result as an ASCII table on stdout."""

    def __init__(self, data_dict: dict):
        super().__init__(data_dict)

    def generate_view(self):
        """Print one PrettyTable per non-empty sheet of ``self._data_dict``."""
        for sheet_name, data in self._data_dict.items():
            # Bug fix: 'return' here aborted every remaining sheet as soon
            # as one sheet had no rows; skip only the empty sheet instead,
            # matching ExcelView/WorkSheetCreator per-sheet behavior.
            if not data.get("rows", []):
                continue
            table = PrettyTable()
            table.title = sheet_name
            table.field_names = data.get("headers", [])
            for row in data.get("rows", []):
                table.add_row(row)
            print(table)
def _write_headers(self):
    """Write the (optionally merged) title row and the column-header row.

    Columns inside the comparison range get the yellow format; all other
    columns get the base/green format. Also records each column's cell
    format and the index of the DIFF ratio column for _write_data().
    """
    base_header_format = self._work_book.add_format(CellFormatType.GREEN_BOLD)
    com_header_format = self._work_book.add_format(CellFormatType.YELLOW_BOLD)
    # [start, end] column indices of the comparison half; [-1, -1] = none.
    com_index_range = [-1, -1]
    overhead = self._data.get("overhead", [])
    if overhead:
        # NOTE(review): overhead[0]/overhead[1] appear to be A1-style range
        # strings like "A1:C1" / "D1:H1" — confirm against ExcelConfig.
        base_path = f"Base Profiling: {self._args.base_profiling_path}"
        self._work_sheet.merge_range(overhead[0], base_path, base_header_format)
        # First letter on each side of ":" is the range's column id.
        com_index_range = [self._col_ids.index(overhead[1].split(":")[0][0]),
                           self._col_ids.index(overhead[1].split(":")[1][0])]
        comparison_path = f"Comparison Profiling: {self._args.comparison_profiling_path}"
        self._work_sheet.merge_range(overhead[1], comparison_path, com_header_format)
        self._row_id += 2
    for index, header in enumerate(self._data.get("headers")):
        if index in range(com_index_range[0], com_index_range[1] + 1):
            header_format = com_header_format
        else:
            header_format = base_header_format
        col_id = self._col_ids[index]
        self._work_sheet.set_column(f"{col_id}:{col_id}", header.get("width"))
        self._work_sheet.write(f"{col_id}{self._row_id}", header.get("name"), header_format)
        # Remember the per-column data format for _write_data().
        self._field_format[index] = self._work_book.add_format(header.get("type"))
        if header.get("name") in (ExcelConfig.DIFF_RATIO, ExcelConfig.DIFF_TOTAL_RATIO):
            self._diff_ratio_index = index
    self._row_id += 1
class ComparisonInterface:
    """Programmatic entry point for the performance comparison backend."""

    def __init__(self, base_profiling_path: str, comparison_profiling_path: str = ""):
        self.base_profiling_path = base_profiling_path
        if comparison_profiling_path:
            # Full compare args are only needed when a comparison path is
            # given; disaggregate_perf works with the base path alone.
            self._args = Args(base_profiling_path=base_profiling_path,
                              comparison_profiling_path=comparison_profiling_path)

    def compare(self, compare_type: str) -> dict:
        """Run the comparison of *compare_type* and return its result dict."""
        if compare_type == Constant.OVERALL_COMPARE:
            self._args.enable_profiling_compare = True

        return ComparisonGenerator(self._args).run_interface(compare_type)

    def disaggregate_perf(self, compare_type: str) -> dict:
        """Break down overall performance for the base profiling data.

        Returns an empty dict for any unsupported *compare_type*.
        """
        if compare_type != Constant.OVERALL_COMPARE:
            # Bug fix: the message was a plain string, so '{compare_type}'
            # was printed literally; it must be an f-string.
            print(f'[ERROR] Invalid compare_type value: {compare_type} which not supported.')
            return {}
        return OverallPerfInterface(self.base_profiling_path).run()
print(f'[INFO] The comparison task has been completed in a total time of {end_time - start_time}') diff --git a/profiler/merge_profiling_timeline_review/README.md b/profiler/merge_profiling_timeline_review/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5075f6bc2fcc8bf04b435562f28a50229b92362e --- /dev/null +++ b/profiler/merge_profiling_timeline_review/README.md @@ -0,0 +1,115 @@ +# 合并大json工具 + +merge_profiling_timeline(合并大json工具)支持合并Profiling的timeline数据,支持合并指定rank的timline、合并指定timeline中的item。 + + +## 多timeline融合 + +### 性能数据采集 + +使用Ascend PyTorch Profiler或者E2E性能采集工具采集性能数据,E2E profiling将被废弃,不建议使用。Ascend PyTorch Profiler采集方式参考:[Profiling数据采集](https://gitee.com/ascend/att/tree/master/profiler)。将采集到的所有节点的性能数据拷贝到当前环境同一目录下,以下假设数据在/home/test/cann_profiling下。 + +E2E Profiling数据目录结构示例如下: + +```bash +|- cann_profiling + |- PROF_*** + |- timeline + |- msprof.json + |- device_* + |- info.json.* + ... + |- PROF_*** + ... +``` + +Ascend PyTorch Profiler数据目录结构示例如下: + +```bash +|- ascend_pytorch_profiling + |- **_ascend_pt + |- ASCEND_PROFILER_OUTPUT + |- trace_view.json + |- FRAMEWORK + |- PROF_*** + |- **_ascend_pt +``` + +### 参数说明 + +| 参数名称 | 说明 | 是否必选 | +| -------- | ------------------------------------------------------------ | -------- | +| -i | 指定Profiling数据目录路径。 | 是 | +| --type | 指定需要合并timeline场景,可选取值:`pytorch`(通过Ascend PyTorch Profiler方式采集profiling数据,合并所有卡的trace_view.json)、`e2e`(通过E2E Profiling方式采集Profiling数据,优先合并总timeline,没有生成则选择合并device目录下的msprof_*.json)、`custom` (自定义需要合并的timeline数据,具体参考**使用示例**)。 | 是 | +| -o | 指定合并后的timeline文件输出的路径(路径末尾可以设置文件名,具体用法参考**使用示例**),不设置该参数的情况下默认文件输出的路径为当前目录(默认文件名为merged.json)。 | 否 | +| --rank | 指定需要合并timeline的Rank ID,默认全部合并。 | 否 | +| --items | 指定需要合并的Profiling数据项,包括:python、Ascend Hardware、CANN、HCCL、PTA、Overlap Analysis,默认全部合并。 | 否 | + +### 使用示例 + +1. 
合并单机多卡timeline,默认合并所有卡、所有数据项,生成first.json在path/to/cann_profiling/output/目录下 + + ```bash + python3 main.py -i path/to/cann_profiling/ -o path/to/cann_profiling/output/first --type pytorch + ``` + +2. 合并单机多卡timeline,默认合并所有卡、所有数据项,不设置-o参数时默认生成merge.json在当前目录下 + + ```bash + python3 main.py -i path/to/cann_profiling/ --type pytorch + ``` + +3. 合并单机多卡timeline,只合并0卡和1卡 + + ```bash + python3 main.py -i path/to/cann_profiling/ -o path/to/cann_profiling/output/2p --type pytorch --rank 0,1 + ``` + +4. 合并单机多卡timeline,合并所有卡的CANN层和Ascend_Hardware层数据 + + ```bash + python3 main.py -i path/to/cann_profiling/ --type pytorch --items "CANN,Ascend Hardware" + ``` + +5. 合并多timeline(自定义) + + 以上场景不支持的情况下,可以使用自定义的合并方式,将需要合并的timeline文件放在同一目录下(附:该场景比较特殊,与正常合并不同,无法直接读取info.json中的rank_id,因此该场景下的rank_id为默认分配的序号,用于区分不同文件的相同层,不代表实际rank_id) + 数据目录结构示意如下: + + ```bash + |- timeline + |- msprof_0.json + |- msprof_1.json + |- msprof_2.json + |- hccl_3.json + |- hccl_4.json + ... + ``` + + 通过下面的命令合并所有timeline,同样支持-o、--rank、--items等参数。 + + ```bash + python3 main.py -i path/to/timeline/ -o path/to/timeline/xxx --type custom + ``` + + 合并timeline查看:在 -o 指定的目录(不设置-o时默认在当前目录下的merged.json)的xxx.json为合并后的文件。 + + +## 超大timeline文件查看 + +[下载whl](https://gitee.com/aerfaliang/trace_processor/releases/download/trace_processor_37.0/trace_processor-37.0-py3-none-any.whl)包并执行如下命令安装(windows): + +```bash +pip3 install trace_processor-37.0-py3-none-any.whl +``` + +安装完成后直接执行如下命令: + +```bash +python -m trace_processor --httpd path/to/xxx_merged.json +``` + +等待加载完毕,刷新[perfetto](https://ui.perfetto.dev/)界面,单击Use old version regardless,再单击`YES, use loaded trace`即可展示timeline(通过W放大、S缩小、A左移、D右移来查看timeline文件)。 + + + \ No newline at end of file diff --git a/profiler/merge_profiling_timeline_review/main.py b/profiler/merge_profiling_timeline_review/main.py new file mode 100644 index 0000000000000000000000000000000000000000..678f5d5a8f7be8c45d6c4935f2941bd716d77a78 --- /dev/null +++ b/profiler/merge_profiling_timeline_review/main.py @@ 
def natural_sort(files):
    """Sort *files* so embedded numbers compare numerically.

    E.g. "msprof_2.json" sorts before "msprof_10.json"; non-numeric parts
    compare case-insensitively.
    """
    def _token(text):
        # Digit runs become ints, everything else compares lower-cased.
        return int(text) if text.isdigit() else text.lower()

    def _key(name):
        return [_token(part) for part in re.split('([0-9]+)', name)]

    return sorted(files, key=_key)
def get_timeline_path(pro_path, type):
    """Walk *pro_path* and return the timeline file for collection *type*.

    For 'pytorch' collections this is the trace_view.json inside the
    first ASCEND_PROFILER_OUTPUT directory encountered; otherwise the
    last (reverse-sorted) file whose name contains 'msprof'. Returns
    None when nothing matches.
    """
    for root, dirs, files in os.walk(pro_path):
        # Ascend PyTorch Profiler layout.
        if type == 'pytorch' and 'ASCEND_PROFILER_OUTPUT' in dirs:
            return os.path.realpath(os.path.join(root, 'ASCEND_PROFILER_OUTPUT', 'trace_view.json'))

        # max() picks the same file the original reverse-sorted scan did.
        msprof_files = [name for name in files if 'msprof' in name]
        if msprof_files:
            return os.path.join(root, max(msprof_files))
    return None
def merge_timeline_events(timeline_file_dict, process_list):
    """Merge the given timeline files into one Chrome-trace JSON file.

    Args:
        timeline_file_dict: mapping of rank_id (or synthetic index for the
            'custom' mode) -> timeline file path.
        process_list: names of the process rows ("python", "CANN", ...) to
            merge; None merges every process found in each file.

    Writes the merged events to "<args.output>.json".
    NOTE(review): relies on the module-level ``args`` (assigned under
    __main__) for the output path instead of receiving it as a parameter —
    the function cannot be reused without that global being set.
    """
    new_events = []
    for rank_id, timeline_path in timeline_file_dict.items():
        node = rank_id // 8  # NOTE(review): computed but never used.
        print("rank id: ", rank_id, "timeline file: ", timeline_path)
        # Refuse symlinked inputs for safety.
        if os.path.islink(timeline_path):
            print(f"The file: \"{timeline_path}\" is link. Please check the path.")
            return
        try:
            with open(timeline_path, 'r+') as f:
                cur_events = json.load(f)
        except Exception as err:
            print("[ERROR] %s" % err)
            return

        # Map process display name -> pid from the "process_name" metadata events.
        proc_pid_dict = {}
        for event in cur_events:
            if event.get("name") == "process_name" and event.get("ph") == "M":
                if event.get("args"):
                    proc_pid_dict[event["args"].get("name")] = event.get("pid")
        process_list_tmp = process_list if process_list else list(proc_pid_dict.keys())
        # Collect the pids of the items selected for merging.
        merged_pids = set()
        for pro in process_list_tmp:
            if pro not in proc_pid_dict.keys():
                print(f"main.py: error argument --items: invalid choice: '{pro}' (choose from {list(proc_pid_dict.keys())})")
                return
            merged_pids.add(proc_pid_dict.get(pro))

        for event in cur_events:

            # Keep only events belonging to the selected processes.
            if merged_pids and event.get('pid') not in merged_pids:
                continue

            # Warn (but keep the event) when tid is not an int.
            if not isinstance(event['tid'], int):
                print(f"[WARNNING] {event['tid']} is not int type")

            # Suffix the process name with the rank id so ranks stay distinguishable.
            if event.get("name") == "process_name" and event.get("ph") == "M":
                if event.get("args") is not None and event["args"].get("name") is not None:
                    event["args"]["name"] = event["args"]["name"] + f"_{rank_id}"

            # Re-key flow-event connect ids so ids from different ranks cannot collide.
            if event.get('id') and (event.get('ph') == 's' or event.get('ph') == 'f'):
                event['id'] = float(event.get('id')) * RANK_ID_POS + rank_id

            new_events.append(event)
    out_path = f"{args.output}.json"
    if os.path.islink(out_path):
        print(f"The file: \"{out_path}\" is link. Please check the path.")
        return
    if os.path.exists(out_path):
        print(f"File {out_path} existed before and is now overwritten.")
        os.remove(out_path)
    try:
        # Create with mode 640 for security.
        with os.fdopen(os.open(out_path, os.O_WRONLY | os.O_CREAT, 0o640), 'w') as f:
            json.dump(new_events, f)
    except FileNotFoundError:
        print(f"Param -o (output path) is not exists, please check it.")
        return
    print(f"timeline merged output path: {out_path}")
"a/profiler/merge_profiling_timeline_review/perfetto\344\275\277\347\224\250\346\214\207\345\257\274\346\210\252\345\233\2762.png" "b/profiler/merge_profiling_timeline_review/perfetto\344\275\277\347\224\250\346\214\207\345\257\274\346\210\252\345\233\2762.png" new file mode 100644 index 0000000000000000000000000000000000000000..48793f136e48f21f618ff3cb13bdcc3388f76930 Binary files /dev/null and "b/profiler/merge_profiling_timeline_review/perfetto\344\275\277\347\224\250\346\214\207\345\257\274\346\210\252\345\233\2762.png" differ diff --git a/profiler/prof_common_review/__init__.py b/profiler/prof_common_review/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/prof_common_review/analyze_dict.py b/profiler/prof_common_review/analyze_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..a06577e8fb49436f7b867e8e74495cc76a6a58b2 --- /dev/null +++ b/profiler/prof_common_review/analyze_dict.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class AnalyzeDict(dict):
    """Dict whose keys are readable as attributes.

    Missing keys resolve to an empty dict, and nested dict values are
    wrapped so attribute access chains (d.a.b.c) work all the way down.
    """

    def __getstate__(self):
        # Pickle the instance __dict__; the mapping itself is handled by dict.
        return self.__dict__

    def __setstate__(self, d):
        self.__dict__.update(d)

    def __getattr__(self, key: str):
        try:
            value = self[key]
        except KeyError:
            # Unknown attribute -> empty dict, so chained access never raises.
            return {}
        return AnalyzeDict(value) if isinstance(value, dict) else value