diff --git a/profiler/advisor_review/README.md b/profiler/advisor_review/README.md new file mode 100644 index 0000000000000000000000000000000000000000..283aa2943881262ffbefaeb7025cf301c17b18fa --- /dev/null +++ b/profiler/advisor_review/README.md @@ -0,0 +1,80 @@ +# advisor + +msprof-analyze的advisor功能是将Ascend PyTorch Profiler或者msprof采集的PyThon场景性能数据进行分析,并输出性能调优建议(当前暂不支持对db格式文件分析)。 + +## 工具使用(命令行方式方式) + +1. 参见《[性能工具](../README.md)》完成工具安装。建议安装最新版本。 + +2. 执行分析。 + + - 总体性能瓶颈 + + ```bash + msprof-analyze advisor all -d [待分析性能数据文件所在路径] -bp [基准性能数据文件所在路径] + ``` + + - 计算瓶颈 + + ```bash + msprof-analyze advisor computation -d [待分析性能数据文件所在路径] + ``` + + - 调度瓶颈 + + ```bash + msprof-analyze advisor schedule -d [待分析性能数据文件所在路径] + ``` + + + -d(必选):待分析性能数据文件所在路径。 + + -bp(可选):基准性能数据文件所在路径。 + + 单卡场景需要指定到性能数据文件`*_ascend_pt`目录;多卡或集群场景需要指定到`*_ascend_pt`目录的父目录层级。 + +3. 查看结果。 + + 分析结果打屏展示并生成html和csv文件。 + +## 工具使用(Jupyter Notebook方式) + +Jupyter Notebook使用方式如下: + +下列以Windows环境下执行为例介绍。 + +1. 在环境下安装Jupyter Notebook工具。 + + ```bash + pip install jupyter notebook + ``` + + Jupyter Notebook工具的具体安装和使用指导请至Jupyter Notebook工具官网查找。 + +2. 在环境下安装ATT工具。 + + ``` + git clone https://gitee.com/ascend/att.git + ``` + + 安装环境下保存Ascend PyTorch Profiler采集的性能数据。 + +3. 进入att\profiler\advisor目录执行如下命令启动Jupyter Notebook工具。 + + ```bash + jupyter notebook + ``` + + 执行成功则自动启动浏览器读取att\profiler\advisor目录,如下示例: + + ![jupyter_report](./img/jupyter_report.PNG) + + 若在Linux环境下则回显打印URL地址,即是打开Jupyter Notebook工具页面的地址,需要复制URL,并使用浏览器访问(若为远端服务器则需要将域名“**localhost**”替换为远端服务器的IP),进入Jupyter Notebook工具页面。 + +4. 每个.ipynb文件为一项性能数据分析任务,选择需要的.ipynb打开,并在*_path参数下拷贝保存Ascend PyTorch Profiler采集的性能数据的路径。如下示例: + + ![advisor_result](./img/advisor_result.PNG) + +5. 
单击运行按钮执行性能数据分析。 + + 分析结果详细内容会在.ipynb页面下展示。 diff --git a/profiler/advisor_review/__init__.py b/profiler/advisor_review/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e79018ed05c6d1cdeb56feaa6182f048e3c8e06f --- /dev/null +++ b/profiler/advisor_review/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from profiler.advisor.interface.interface import Interface \ No newline at end of file diff --git a/profiler/advisor_review/advisor_backend/__init__.py b/profiler/advisor_review/advisor_backend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a0e9f748f4b10347a874f60cec1fa9f6e5285a5e --- /dev/null +++ b/profiler/advisor_review/advisor_backend/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class AdviceBase:
    """Abstract base for every advice analyzer backend.

    Concrete subclasses implement ``path_check``/``run``/``output`` and fill
    ``output_format_data`` with their findings.
    """

    # keys of the unified result dictionary returned by every analyzer
    DATA = "data"
    BOTTLENECK = "bottleneck"
    ADVICE = "advice"

    def __init__(self, collection_path: str):
        # resolve symlinks / relative segments once, up front
        self.collection_path = os.path.realpath(collection_path)
        # NOTE(review): attribute name 'bottelneck' (sic) is kept -- it is part
        # of the public surface subclasses and callers may rely on
        self.bottelneck = ''
        self.output_format_data = {self.DATA: [], self.BOTTLENECK: '', self.ADVICE: ''}

    @abstractmethod
    def path_check(self):
        """
        check whether input path is valid
        """

    @abstractmethod
    def run(self):
        """
        analyze profiling data and advice
        """

    @abstractmethod
    def output(self):
        """
        output relevant data
        """
2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/profiler/advisor_review/advisor_backend/advice_factory/advice_factory.py b/profiler/advisor_review/advisor_backend/advice_factory/advice_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..639f4800cfe8c9acdc8fe7ea5f65a43fc8892b2b --- /dev/null +++ b/profiler/advisor_review/advisor_backend/advice_factory/advice_factory.py @@ -0,0 +1,50 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class AdviceFactory:
    """Dispatch a named advice analysis over a profiling collection path.

    Subclasses supply an ``ADVICE_LIB`` class attribute (advice name ->
    advice class) and override :meth:`run_advice`.
    """

    def __init__(self, collection_path: str):
        # resolve the user-supplied path once
        self.collection_path = os.path.realpath(collection_path)

    def run_advice(self, advice: str, kwargs: dict):
        """
        run advice to produce data

        Overridden by subclasses.  Bug fix: this hook was previously decorated
        with ``@staticmethod`` while still declaring ``self``, so calling it
        through an instance shifted every argument by one position.
        """

    def produce_advice(self, advice: str, kwargs: dict):
        """
        produce data for input mode and advice
        """
        self.path_check()
        self.advice_check(advice)
        return self.run_advice(advice, kwargs)

    def path_check(self):
        """
        check whether input path is valid
        """
        PathManager.input_path_common_check(self.collection_path)

    def advice_check(self, advice: str):
        """
        check whether input advice is valid

        Raises:
            RuntimeError: if ``advice`` is not registered in ``ADVICE_LIB``.
        """
        if advice not in self.ADVICE_LIB:
            msg = '[ERROR]Input advice is illegal.'
            raise RuntimeError(msg)
class ClusterAdviceFactory(AdviceFactory):
    """Factory for cluster-scope advice (slow rank/link, pipeline, kernel)."""

    # registry mapping advice names to their analyzer classes
    ADVICE_LIB = {
        Constant.SLOW_RANK: SlowRankAdvice,
        Constant.SLOW_LINK: SlowLinkAdvice,
        Constant.PIPELINE: ClusterPipelineAdvice,
        Constant.KERNEL: KernelClusterAdvice
    }

    def __init__(self, collection_path: str):
        super().__init__(collection_path)

    def run_advice(self, advice: str, kwargs: dict):
        """Instantiate the analyzer registered for ``advice`` and run it."""
        advice_cls = self.ADVICE_LIB.get(advice)
        return advice_cls(self.collection_path, kwargs).run()
class ComputeAdviceFactory(AdviceFactory):
    """Factory for computation-side advice (fused / slow NPU operators)."""

    # registry mapping advice names to their analyzer classes
    ADVICE_LIB = {
        Constant.NPU_FUSED: NpuFusedAdvice,
        Constant.NPU_SLOW: NpuSlowAdvice,
    }

    def __init__(self, collection_path: str):
        super().__init__(collection_path)

    def run_advice(self, advice: str, kwargs: dict):
        """Instantiate the analyzer registered for ``advice`` and run it."""
        advice_cls = self.ADVICE_LIB.get(advice)
        return advice_cls(self.collection_path).run()
class OverallAdviceFactory(AdviceFactory):
    """Factory for whole-run summary advice."""

    # registry mapping advice names to their analyzer classes
    ADVICE_LIB = {
        Constant.SUMMARY: OverallSummaryAdvice
    }

    def __init__(self, collection_path: str):
        super().__init__(collection_path)

    def run_advice(self, advice: str, kwargs: dict):
        """Instantiate the analyzer registered for ``advice`` and run it."""
        advice_cls = self.ADVICE_LIB.get(advice)
        return advice_cls(self.collection_path, kwargs).run()
class TimelineAdviceFactory(AdviceFactory):
    """Factory for timeline-side advice (optimizer, operator scheduling)."""

    # registry mapping advice names to their analyzer classes
    ADVICE_LIB = {
        Constant.OPTIM: OptimizerAdvice,
        Constant.OP_SCHE: OpScheduleAdvice,
    }

    def __init__(self, collection_path: str):
        super().__init__(collection_path)

    def run_advice(self, advice: str, kwargs: dict):
        """Instantiate the analyzer registered for ``advice`` and run it."""
        advice_cls = self.ADVICE_LIB.get(advice)
        return advice_cls(self.collection_path).run()
diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/cluster_advice_base.py b/profiler/advisor_review/advisor_backend/cluster_advice/cluster_advice_base.py new file mode 100644 index 0000000000000000000000000000000000000000..e9be4675963a9cd48da3b4cd91ee646f8e82468b --- /dev/null +++ b/profiler/advisor_review/advisor_backend/cluster_advice/cluster_advice_base.py @@ -0,0 +1,67 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class ClusterAdviceBase(AdviceBase):
    """Base class for cluster-level advice: triggers the cluster analysis
    backend once (if its output directory is absent) before analyzers run."""

    def __init__(self, collection_path: str):
        super().__init__(collection_path)

    @staticmethod
    def compute_max_gap_ratio(data: list, mean: float):
        """Return (max - min) / mean, or 0 when the mean is zero."""
        return 0 if mean == 0 else (max(data) - min(data)) / mean

    def path_check(self):
        """
        check whether input path is valid
        """
        # a pre-existing output directory means the backend already ran once
        if 'cluster_analysis_output' in os.listdir(self.collection_path):
            print("[INFO]Cluster has been analyzed "
                  "because of the existence of cluster analysis output directory.")
            print("[INFO]Skip Cluster analyze backend.")
            return
        print("[INFO] cluster analysis is in the process, please wait...")
        self.cluster_analyze()

    def cluster_analyze(self):
        """Run the cluster analysis backend over the whole collection."""
        params = {
            Constant.COLLECTION_PATH: self.collection_path,
            Constant.ANALYSIS_MODE: "all"
        }
        try:
            Interface(params).run()
        except Exception as e:
            raise ValueError(f"Cluster analyze backend failed:{e}") from e

    @abstractmethod
    def run(self):
        """
        analyze profiling data and advice
        """

    @abstractmethod
    def output(self):
        """
        output relevant data
        """
@dataclass
class PipelineTimeSlice:
    """One horizontal slice of the pipeline view: [start, end] plus a type tag."""
    start: str = ""
    end: str = ""
    slice_type: str = ""
    bp_timeslice: list = None

    def __post_init__(self):
        # dataclasses forbid mutable defaults, so normalize falsy -> fresh list here
        self.bp_timeslice = self.bp_timeslice or []


class PipelineTraceViewer:
    """Render PipelineTimeSlice sequences as chrome-trace complete events."""
    STAGE_COLOR = "good"
    BUBBLE_COLOR = "generic_work"
    FP_COLOR = "good"
    BP_COLOR = "bad"
    PIPLINE_VIEW = "Pipeline View"
    STAGE = "Stage"
    BUBBLE = "Bubble"
    FP = "FP"
    BP = "BP"

    # slice-type name -> chrome trace color name
    COLORS = {
        STAGE: STAGE_COLOR,
        BUBBLE: BUBBLE_COLOR,
        FP: FP_COLOR,
        BP: BP_COLOR
    }

    def _gen_trace_pair(self, name: str, start_ts: str, end_ts: str, pid: str, tid: str) -> Dict:
        """Build a single complete ('X') trace event dictionary."""
        return {
            Constant.OP_NAME: name,
            Constant.CNAME: self.COLORS.get(name, self.BUBBLE),
            Constant.PH: Constant.PH_X,
            Constant.PID: pid,
            Constant.OP_TID: tid,
            Constant.TS: start_ts,
            Constant.DUR: str(Decimal(end_ts) - Decimal(start_ts))
        }

    def gen_stage_bubble_trace_data(self, rank_id: int, timeslice_list: List[PipelineTimeSlice]) -> List[Dict]:
        """
        generate stage bubble trace json data
        """
        row = f'Rank {rank_id}'
        return [
            self._gen_trace_pair(piece.slice_type, piece.start, piece.end, self.PIPLINE_VIEW, row)
            for piece in timeslice_list
        ]

    def gen_fp_bp_trace_data(self, rank_id: int, timeslice_list: List[PipelineTimeSlice]) -> List[Dict]:
        """
        generate fp bp trace json data
        """
        row = f'Rank {rank_id}'
        events = []
        for piece in timeslice_list:
            if piece.slice_type == self.BUBBLE:
                events.append(self._gen_trace_pair(piece.slice_type, piece.start,
                                                   piece.end, self.PIPLINE_VIEW, row))
                continue
            # split each stage slice into alternating FP / BP segments
            cursor = piece.start
            for bp_start, bp_end in piece.bp_timeslice:
                events.append(self._gen_trace_pair(self.FP, cursor,
                                                   bp_start, self.PIPLINE_VIEW, row))
                events.append(self._gen_trace_pair(self.BP, bp_start,
                                                   bp_end, self.PIPLINE_VIEW, row))
                cursor = bp_end
            events.append(self._gen_trace_pair(self.FP, cursor,
                                               piece.end, self.PIPLINE_VIEW, row))
        return events
class ClusterPipelineAdvice(ClusterAdviceBase):
    """Build a per-rank pipeline-parallel view (Stage / Bubble / FP / BP slices)
    from each rank's trace_view.json and emit it as chrome-trace data."""

    BUBBLE = "Bubble"
    STAGE = "Stage"
    PIPELINE_VIEW = "Pipeline View"
    SAVE_JSON = "pipeline_view.json"

    def __init__(self, collection_path: str, kwargs: dict):
        super().__init__(collection_path)
        # de-duplicate requested rank ids; order is not preserved by set()
        self.rank_ids = list(set(kwargs.get("rank_ids", [])))
        self.worker_num = kwargs.get("worker_num", int(multiprocessing.cpu_count() / 2))
        self.rank_prof_dirs = {}
        self.cur_data = []
        self.cur_bottleneck = {}
        self.cur_advices = ""

    def run(self) -> dict:
        """
        Unified entrance interface
        """
        self.rank_prof_dirs = self.get_rank_prof_dirs(self.rank_ids)
        if not self.rank_prof_dirs:
            print("[ERROR] No rank profiling data found, please check the rank ids or dir path.")
            return {}

        self.process()
        # NOTE(review): output() runs before identify_bottleneck(); harmless today
        # because identify_bottleneck() is a no-op and output() stores a reference
        # to cur_bottleneck -- confirm ordering if identify_bottleneck gains logic
        self.output()
        self.identify_bottleneck()
        return self.output_format_data

    def process(self) -> None:
        """
        process all rank profiling data by using multi-process
        """
        start_time = time.time()
        print(f"[INFO] Start to process {len(self.rank_prof_dirs)} rank profiling data with {self.worker_num} workers.")
        with multiprocessing.Pool(self.worker_num) as pool:
            results = pool.map(self.work, self.rank_prof_dirs.items())

        # results are in pool.map order, so they pair 1:1 with rank_prof_dirs items
        for (rank_id, _), (res, show_fp_bp) in zip(self.rank_prof_dirs.items(), results):
            if show_fp_bp:
                self.cur_data += PipelineTraceViewer().gen_fp_bp_trace_data(rank_id, res)
            else:
                self.cur_data += PipelineTraceViewer().gen_stage_bubble_trace_data(rank_id, res)
        print(f"[INFO] Pipline view data process finished, cost {time.time() - start_time:.2f}s.")

    @staticmethod
    def _align_trace_bound(results: List) -> None:
        """
        align all rank trace bound for better visualization
        """
        # NOTE(review): this helper appears unused within this class -- confirm
        # whether an external caller relies on it before removing
        start_list, end_list = [], []
        for res in results:
            start_list.append(res[0].start)
            end_list.append(res[-1].end)

        # update all rank trace bound
        for res in results:
            res[0].start = min(start_list)
            res[-1].end = max(end_list)

    def work(self, kv: Tuple[int, str]) -> Tuple[List[PipelineTimeSlice], bool]:
        """
        single process worker function
        """
        show_fp_bp = False
        rank_id, rank_prof_dir = kv
        print(f"[INFO] [Rank {rank_id}] Start to process rank profiling data.")
        json_path = os.path.join(rank_prof_dir, Constant.ASCEND_PROFILER_OUTPUT, Constant.TRACE_VIEW_JSON)
        fine_data = self.load_trace_view_data(json_path)
        # hcom send/recv ops delimit pipeline stages; without them no view is possible
        if not fine_data.hcom_ops or not fine_data.hcom_tids:
            print(f"[ERROR] [Rank {rank_id}] No hcom send recv ops found, make sure the trace view data is pipeline "
                  f"parallel sense.")
            return [], show_fp_bp

        timeslice_list = self.get_pipeline_timeslice(fine_data.hcom_ops, fine_data.hcom_tids, fine_data.min_ts,
                                                     fine_data.max_ts)
        # FP/BP refinement needs framework-level ops and exactly one hcom tid
        if not fine_data.fp_ops or not fine_data.bp_ops:
            print(f"[INFO] [Rank {rank_id}] No frameWork data in trace view, only show stage and bubble.")
        elif len(fine_data.hcom_tids) > 1:
            print(f"[WARN] [Rank {rank_id}] More than one hcom tid found, only show stage and bubble.")
        else:
            print(f"[INFO] [Rank {rank_id}] Found frameWork data in trace view, show fp bp and bubble.")
            bp_ops = self.get_fp_bp_bound_ops(fine_data)
            self.update_stage_fp_bp(timeslice_list, bp_ops)
            show_fp_bp = True
        print(f"[INFO] [Rank {rank_id}] Rank profiling data process finished.")

        return timeslice_list, show_fp_bp

    def identify_bottleneck(self) -> None:
        # intentionally empty: pipeline advice currently only renders the view
        pass

    def output(self) -> None:
        """
        output result
        """
        # prepend a process-name metadata event so trace viewers label the row
        self.cur_data.append(
            {
                Constant.OP_NAME: Constant.PROCESS_NAME,
                Constant.PH: Constant.PH_META,
                Constant.PID: self.PIPELINE_VIEW,
                Constant.OP_TID: self.PIPELINE_VIEW,
                Constant.ARGS: {
                    Constant.OP_NAME: self.PIPELINE_VIEW
                }
            }
        )
        self.output_format_data[self.DATA] = self.cur_data
        self.output_format_data[self.BOTTLENECK] = self.cur_bottleneck
        self.output_format_data[self.ADVICE] = self.cur_advices

    def get_rank_prof_dirs(self, rank_ids: list) -> Dict[int, str]:
        """
        get rank profiling directories by rank ids
        """
        rank_prof_dirs = defaultdict(str)
        prof_dirs = []
        for prof_dir in os.listdir(self.collection_path):
            if prof_dir.endswith(Constant.PT_PROF_SUFFIX):
                prof_dirs.append(os.path.join(self.collection_path, prof_dir))

        data_map = PytorchDataPreprocessor(prof_dirs).get_data_map()
        for rank_id in rank_ids:
            if rank_id in data_map:
                rank_prof_dirs[rank_id] = data_map[rank_id]
            else:
                print(f'[Warning] Rank {rank_id} not found in {self.collection_path}')

        return rank_prof_dirs

    @staticmethod
    def load_trace_view_data(json_path) -> Optional[FineTraceViewData]:
        """
        load trace view data from json file and preprocess
        """
        raw_data = FileManager.read_json_file(json_path)
        return TraceViewPreProcessor().process(raw_data)

    @staticmethod
    def double_queue_pop(fp_que: Deque[dict], bp_que: Deque[dict]) -> Tuple[list, list]:
        """
        double queue (fp and bp que) pop alternating algorithm implementation

        Pops runs of consecutive FP ops and BP ops alternately, recording the
        (earliest-start, latest-end) bound op pair for each run.
        Precondition: both queues are non-empty (callers ensure this).
        """
        res_fp_ops, res_bp_ops = [], []
        # start with whichever queue holds the earlier op
        pop_fp = fp_que[0][Constant.TS] < bp_que[0][Constant.TS]
        fp_start_op, fp_end_op = fp_que[0], fp_que[0]
        bp_start_op, bp_end_op = bp_que[0], bp_que[0]

        def update_bound_op(que: Deque[dict], start_op: dict, end_op: dict) -> Tuple[dict, dict]:
            """
            update fp and bp bound op
            """
            op = que.popleft()
            op_s = Decimal(op[Constant.TS])
            op_e = op_s + Decimal(op[Constant.DUR])

            # keep the op with the earliest start and the op with the latest end
            start_op = op if op_s < Decimal(start_op[Constant.TS]) else start_op
            end_op = op if op_e > Decimal(end_op[Constant.TS]) + Decimal(end_op[Constant.DUR]) else end_op

            return start_op, end_op

        while fp_que and bp_que:
            if pop_fp:
                # switch to the bp queue once the next fp op would pass the bp head
                if len(fp_que) > 1 and bp_que and fp_que[1][Constant.TS] > bp_que[0][Constant.TS]:
                    pop_fp = False  # pop bp que
                if len(fp_que) == 1:
                    pop_fp = False  # pop bp que

                fp_start_op, fp_end_op = update_bound_op(fp_que, fp_start_op, fp_end_op)

                # time to pop bp que, need to record fp ops and update bp start op
                if not pop_fp:
                    res_fp_ops.append((fp_start_op, fp_end_op))
                    # NOTE(review): guard checks fp_que but resets the *bp* bound
                    # ops -- looks asymmetric; confirm intent against the bp branch
                    if fp_que:
                        bp_start_op, bp_end_op = bp_que[0], bp_que[0]
            else:
                if len(bp_que) > 1 and fp_que and bp_que[1][Constant.TS] > fp_que[0][Constant.TS]:
                    pop_fp = True  # pop fp que
                if len(bp_que) == 1:
                    pop_fp = True  # pop fp que

                bp_start_op, bp_end_op = update_bound_op(bp_que, bp_start_op, bp_end_op)

                # time to pop fp que, need to record bp ops and update fp start op
                if pop_fp:
                    res_bp_ops.append((bp_start_op, bp_end_op))
                    if bp_que:
                        fp_start_op, fp_end_op = fp_que[0], fp_que[0]

        # drain whichever queue still has ops as one final run
        if fp_que:
            fp_start_op, fp_end_op = fp_que[0], fp_que[0]
            while fp_que:
                fp_start_op, fp_end_op = update_bound_op(fp_que, fp_start_op, fp_end_op)
            res_fp_ops.append((fp_start_op, fp_end_op))

        if bp_que:
            bp_start_op, bp_end_op = bp_que[0], bp_que[0]
            while bp_que:
                bp_start_op, bp_end_op = update_bound_op(bp_que, bp_start_op, bp_end_op)
            res_bp_ops.append((bp_start_op, bp_end_op))

        return res_fp_ops, res_bp_ops

    @staticmethod
    def update_ops_time(ops_list: List[List[dict]], torch_to_npu_links: List[dict],
                        npu_ops_ts_dur: dict) -> List[List[dict]]:
        """
        update fp and bp bound ops time at device by using torch_to_npu_links

        Mutates the bound op dicts in place, adding "npu_op_ts"/"npu_op_dur".
        """
        ops_que = deque(ops_list)
        torch_to_npu_que = deque(torch_to_npu_links)
        res = []
        link_stack = []
        while ops_que and torch_to_npu_que:
            link = torch_to_npu_que.popleft()
            link_s = Decimal(link[Constant.TS])

            # bound op at framework level
            cpu_op_l, cpu_op_r = ops_que[0][0], ops_que[0][1]
            cpu_op_s = Decimal(cpu_op_l[Constant.TS])
            cpu_op_e = Decimal(cpu_op_r[Constant.TS]) + Decimal(cpu_op_r[Constant.DUR])

            # collect every link that falls inside the current cpu op window
            if cpu_op_s < link_s < cpu_op_e:
                link_stack.append(link)
            if link_s > cpu_op_e or \
                    (link_stack and not torch_to_npu_que):
                min_link = link_stack[0]
                max_link = link_stack[-1]

                # NOTE(review): device timestamps are taken from Constant.ID of the
                # link events (while link_s above uses Constant.TS) -- confirm the
                # link schema really carries the npu timestamp in its id field
                min_link_s = str(min_link[Constant.ID])
                max_link_s = str(max_link[Constant.ID])
                # for compatibility with old data (ts is float type)
                if isinstance(min_link[Constant.ID], float):
                    cpu_op_l["npu_op_ts"] = min_link_s
                    cpu_op_r["npu_op_ts"] = max_link_s
                else:
                    # integer ids encode microseconds*1000; insert the decimal point
                    cpu_op_l["npu_op_ts"] = f"{min_link_s[:-3]}.{min_link_s[-3:]}"
                    cpu_op_r["npu_op_ts"] = f"{max_link_s[:-3]}.{max_link_s[-3:]}"
                cpu_op_l["npu_op_dur"] = npu_ops_ts_dur.get(cpu_op_l["npu_op_ts"], 0)
                cpu_op_r["npu_op_dur"] = npu_ops_ts_dur.get(cpu_op_r["npu_op_ts"], 0)

                res.append([cpu_op_l, cpu_op_r])
                ops_que.popleft()
                link_stack.clear()

        return res

    def get_fp_bp_bound_ops(self, fine_data: FineTraceViewData) -> List[List[dict]]:
        """
        get fp and bp bound ops by using double queue alternating pop algorithm and
        update fp and bp bound ops time at device by using torch_to_npu_links
        """
        fp_que = deque(fine_data.fp_ops)
        bp_que = deque(fine_data.bp_ops)

        # get fp and bp bound ops
        _, res_bp_ops = self.double_queue_pop(fp_que, bp_que)

        # according to torch_to_npu_links, split fp and bp timeslice
        bp_ops = self.update_ops_time(res_bp_ops, fine_data.torch_to_npu_links, fine_data.npu_ops_ts_dur)
        return bp_ops

    def get_pipeline_timeslice(self, hcom_ops: list, hcom_tids: list,
                               min_ts: str, max_ts: str) -> List[PipelineTimeSlice]:
        """
        get pipeline timeslice by using hcom ops

        hcom ops become BUBBLE slices; the gaps between them become STAGE slices.
        """
        timeslice_list = []
        last_op_end = None
        if len(hcom_tids) > 1:
            print("[WARN] More than one hcom tid found, default to show minimal tid pipeline view.")

        for op in hcom_ops:
            # only the minimal hcom tid is rendered
            if op[Constant.OP_TID] == min(hcom_tids):
                # gap between two hcom ops
                if last_op_end:
                    timeslice_list.append(PipelineTimeSlice(str(last_op_end), op[Constant.TS], self.STAGE))
                # hcom op
                last_op_end = Decimal(op[Constant.TS]) + Decimal(op[Constant.DUR])
                timeslice_list.append(PipelineTimeSlice(op[Constant.TS], str(last_op_end), self.BUBBLE))

        # add start STAGE and end STAGE
        timeslice_list.insert(0, PipelineTimeSlice(min_ts, timeslice_list[0].start, self.STAGE))
        timeslice_list.insert(len(timeslice_list), PipelineTimeSlice(timeslice_list[-1].end, max_ts, self.STAGE))
        return timeslice_list

    def update_stage_fp_bp(self, timeslice_list: List[PipelineTimeSlice],
                           bp_ops: List[List[dict]]) -> None:
        """
        update stage fp and bp time

        Attaches each BP bound interval to the STAGE slice that fully contains it.
        """
        pipeline_que = deque(timeslice_list)
        bp_bound_que = deque(bp_ops)

        while pipeline_que and bp_bound_que:
            # skip non-STAGE slices; bail out if the pipeline queue drains
            while pipeline_que[0].slice_type != self.STAGE:
                pipeline_que.popleft()
                if not pipeline_que:
                    return None

            bp_bound_data = bp_bound_que[0]
            bp_bound_s = Decimal(bp_bound_data[0]['npu_op_ts'])
            bp_bound_e = Decimal(bp_bound_data[1]['npu_op_ts']) + Decimal(bp_bound_data[1]['npu_op_dur'])

            pipeline_s = Decimal(pipeline_que[0].start)
            pipeline_e = Decimal(pipeline_que[0].end)

            if pipeline_s <= bp_bound_s and bp_bound_e <= pipeline_e:
                # BP interval lies inside the current stage: record it
                pipeline_que[0].bp_timeslice.append((str(bp_bound_s), str(bp_bound_e)))
                bp_bound_que.popleft()
            elif bp_bound_s > pipeline_e:
                # BP interval starts after this stage: advance to the next stage
                pipeline_que.popleft()
            else:
                # straddling interval cannot be attributed: drop it
                bp_bound_que.popleft()
+ raise RuntimeError(msg) + + data_map = PytorchDataPreprocessor(prof_dirs).get_data_map() + self.all_kernel_data = pd.DataFrame() + for rank_id, profiling_dir_path in data_map.items(): + kernel_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.KERNEL_DETAILS_CSV) + if kernel_file: + # 判断csv文件大小 + PathManager.check_path_readable(kernel_file) + # 读取CSV文件 + df_temp = pd.read_csv(kernel_file) + columns_to_keep = self.COLUMNS_TO_GROUP + self.COLUMNS_TO_CAL + if [1 for element in columns_to_keep if element not in list(df_temp)]: + msg = "[ERROR] Kernel details.csv has wrong data columns, terminate analysis." + raise RuntimeError(msg) + df = df_temp[columns_to_keep] + df.insert(loc=0, column='rank id', value=rank_id) + # 将数据添加到最终的数据框中 + self.all_kernel_data = pd.concat([self.all_kernel_data, df], ignore_index=True) + + def calculate_data(self): + # 存储所有合并后的数据 + calculate_dict = {self.COLUMNS_TO_CAL[i]: self.CAL_FUN + for i in range(len(self.COLUMNS_TO_CAL))} + group_col = ["rank id"] + self.COLUMNS_TO_GROUP + view_data = self.all_kernel_data.groupby(group_col).agg(calculate_dict).reset_index() + view_data.columns = [''.join(col) if col[1] == "" else '_'.join(col) for col in view_data.columns] + return view_data + + def get_prof_dirs(self, collection_path): + prof_dirs = [] + for prof_dir in os.listdir(collection_path): + if prof_dir.endswith(AdvisorConstant.PT_PROF_SUFFIX): + prof_dirs.append(os.path.join(collection_path, prof_dir)) + + return prof_dirs \ No newline at end of file diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/slow_link_advice.py b/profiler/advisor_review/advisor_backend/cluster_advice/slow_link_advice.py new file mode 100644 index 0000000000000000000000000000000000000000..f8a625242f3939602cbb7b8391cd8062e21fe01b --- /dev/null +++ b/profiler/advisor_review/advisor_backend/cluster_advice/slow_link_advice.py @@ -0,0 +1,110 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from collections import defaultdict
from common_func_advisor.constant import Constant
from common_func.file_manager import FileManager
from cluster_advice.cluster_advice_base import ClusterAdviceBase


class SlowLinkAdvice(ClusterAdviceBase):
    """Detect slow communication links from cluster_communication.json.

    Accumulates per-rank RDMA/SDMA transit time and size, derives bandwidth,
    and appends a bottleneck summary when the per-rank spread is notable.
    """
    RDMA_TIME_MS = "RDMA time(ms)"
    RDMA_SIZE_MB = "RDMA size(mb)"
    SDMA_TIME_MS = "SDMA time(ms)"
    SDMA_SIZE_MB = "SDMA size(mb)"
    RDMA_BANDWIDTH = "RDMA bandwidth(GB/s)"
    SDMA_BANDWIDTH = "SDMA bandwidth(GB/s)"
    COMMUNICATION_BANDWIDTH_INFO = "Communication Bandwidth Info"
    TRANSIT_TIME = "Transit Time(ms)"
    TRANSIT_SIZE = "Transit Size(MB)"
    SDMA = "SDMA"
    RDMA = "RDMA"

    def __init__(self, collection_path: str, kwargs: dict = None):
        super().__init__(collection_path)
        # per-rank accumulators; entries are created lazily on first access
        self.rank_bw_dict = defaultdict(lambda: {
            self.RDMA_TIME_MS: 0,
            self.RDMA_SIZE_MB: 0,
            self.SDMA_TIME_MS: 0,
            self.SDMA_SIZE_MB: 0,
        })

    @staticmethod
    def compute_ratio(dividend: float, divisor: float):
        """Return dividend / divisor rounded to 4 digits; 0 for a (near-)zero divisor."""
        if abs(divisor) < 1e-15:
            return 0
        return round(dividend / divisor, 4)

    def load_communication_json(self):
        """Load cluster_communication.json.

        Raises:
            RuntimeError: when the file does not exist in the analysis output dir.
        """
        json_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT, Constant.CLUSTER_COMM_JSON)
        if not os.path.exists(json_path):
            msg = "[ERROR] cluster_communication.json doesn't exist, terminate analysis."
            raise RuntimeError(msg)
        communication_json = FileManager.read_json_file(json_path)
        return communication_json

    def run(self):
        """Entry point: analyse the collection and return the formatted result dict."""
        self.path_check()
        communication_json = self.load_communication_json()
        self.process(communication_json)
        self.output()
        return self.output_format_data

    def process(self, communication_json: dict):
        """Accumulate bandwidth per rank, then flag RDMA/SDMA bottlenecks."""
        # layout: {comm_group: {step: {op: {rank_id: ...}}}}; only the op dicts matter
        for group_dict in communication_json.values():
            for step_dict in group_dict.values():
                for op_dict in step_dict.values():
                    self.compute_bandwidth(op_dict)
        if self.rank_bw_dict:
            self.produce_bottleneck(self.RDMA_BANDWIDTH)
            self.produce_bottleneck(self.SDMA_BANDWIDTH)

    def compute_bandwidth(self, op_dict: dict):
        """Add one op's transit time/size per rank and refresh the bandwidth columns.

        Raises:
            ValueError: when a rank id in the json is not an integer.
        """
        for rank_id, rank_dict in op_dict.items():
            try:
                rank = int(rank_id)
            except ValueError as e:
                msg = "[ERROR] Cluster_communication.json has invalid structure."
                raise ValueError(msg) from e
            for comm_type, bw_dict in rank_dict.get(self.COMMUNICATION_BANDWIDTH_INFO, {}).items():
                # default to 0 so a record missing Transit Size/Time cannot make `+=` crash on None
                if comm_type == self.SDMA:
                    self.rank_bw_dict[rank][self.SDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE, 0)
                    self.rank_bw_dict[rank][self.SDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME, 0)
                if comm_type == self.RDMA:
                    self.rank_bw_dict[rank][self.RDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE, 0)
                    self.rank_bw_dict[rank][self.RDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME, 0)

        for rank in self.rank_bw_dict:
            self.rank_bw_dict[rank][self.RDMA_BANDWIDTH] = self.compute_ratio(
                self.rank_bw_dict[rank][self.RDMA_SIZE_MB], self.rank_bw_dict[rank][self.RDMA_TIME_MS])
            self.rank_bw_dict[rank][self.SDMA_BANDWIDTH] = self.compute_ratio(
                self.rank_bw_dict[rank][self.SDMA_SIZE_MB], self.rank_bw_dict[rank][self.SDMA_TIME_MS])

    def produce_bottleneck(self, link_type: str):
        """Append a human-readable bandwidth summary for link_type to the bottleneck text."""
        data_list = [rank_dict.get(link_type, 0) for rank_dict in self.rank_bw_dict.values()]
        avg_bw = round(sum(data_list) / len(data_list), 3)
        if avg_bw == 0:
            return
        max_bw = max(data_list)
        min_bw = min(data_list)
        # message fix: the average now carries its GB/s unit like the other figures,
        # and the last sentence starts with a capital letter
        self.bottelneck += f'{link_type}: \n' \
                           f'The average is {avg_bw}GB/s, ' \
                           f'while the maximum is {round(max_bw, 3)}GB/s and ' \
                           f'the minimum is {round(min_bw, 3)}GB/s. ' \
                           f'The difference is {round(max_bw - min_bw, 3)}GB/s. \n'

    def output(self):
        """Publish accumulated data and bottleneck text through the base-class result dict."""
        self.output_format_data[self.DATA] = self.rank_bw_dict
        self.output_format_data[self.BOTTLENECK] = self.bottelneck


# ---- patch metadata preserved from the original (flattened) diff ----
# diff --git a/profiler/advisor_review/advisor_backend/cluster_advice/slow_rank_advice.py b/profiler/advisor_review/advisor_backend/cluster_advice/slow_rank_advice.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..4e789fb7fb688626df7e8f5b25b84e4955d6c2a3
# --- /dev/null
# +++ b/profiler/advisor_review/advisor_backend/cluster_advice/slow_rank_advice.py
# @@ -0,0 +1,71 @@
# Copyright (c) 2023, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from collections import defaultdict
from common_func_advisor.constant import Constant
from common_func.file_manager import FileManager
from cluster_advice.cluster_advice_base import ClusterAdviceBase
from prof_bean_advisor.cluster_step_trace_time_bean import ClusterStepTraceTimeBean


class SlowRankAdvice(ClusterAdviceBase):
    """Flag ranks whose compute/communication/free time diverges from the cluster mean."""

    RANK = "rank"
    RATIO_THRESHOLD = 0.05
    BOTTLENECK_LIST = ['Computing', 'Communication', "Free"]

    def __init__(self, collection_path: str, kwargs: dict = None):
        super().__init__(collection_path)

    def load_step_time(self):
        """Read cluster_step_trace_time.csv into bean objects; raise if the file is absent."""
        csv_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT,
                                Constant.CLUSTER_STEP_TIME_CSV)
        if not os.path.exists(csv_path):
            msg = "[ERROR] cluster_step_trace_time.csv doesn't exist, terminate analysis."
            raise RuntimeError(msg)
        return FileManager.read_csv_file(csv_path, ClusterStepTraceTimeBean)

    def run(self):
        """Entry point: load step times, detect bottlenecks, and return the result dict."""
        self.path_check()
        records = self.load_step_time()
        per_rank = self.process(records)
        self.output(per_rank)
        return self.output_format_data

    def process(self, step_data: list):
        """Sum compute/communication/free time per rank and emit bottleneck text."""
        per_rank = defaultdict(lambda: [0, 0, 0, 0])
        for bean in step_data:
            if bean.type != self.RANK:
                continue
            totals = per_rank[bean.index]
            totals[0] += bean.compute
            totals[1] += bean.communication
            totals[2] += bean.free
        rank_totals = [sum(times) for times in per_rank.values()]
        if rank_totals:
            mean_total_time = sum(rank_totals) / len(rank_totals)
            for metric_idx, _ in enumerate(self.BOTTLENECK_LIST):
                self.produce_bottleneck(per_rank, metric_idx, mean_total_time)
        return per_rank

    def produce_bottleneck(self, step_dict: dict, produce_type: int, mean_total_time: float):
        """Append a message when the per-rank gap of one metric exceeds the threshold."""
        data_list = [times[produce_type] for times in step_dict.values()]
        max_ratio = self.compute_max_gap_ratio(data_list, mean_total_time)
        if max_ratio > self.RATIO_THRESHOLD:
            self.bottelneck += f'{self.BOTTLENECK_LIST[produce_type]} has some issues in the cluster, ' \
                               f'because the max difference of {self.BOTTLENECK_LIST[produce_type]} time ' \
                               f'has reached {round(max_ratio * mean_total_time / 1000, 3)}ms. \n'

    def output(self, step_dict: dict):
        """Expose the per-rank sums and bottleneck text via the base-class result dict."""
        self.output_format_data[self.DATA] = step_dict
        self.output_format_data[self.BOTTLENECK] = self.bottelneck


# ---- patch metadata preserved from the original (flattened) diff ----
# diff --git a/profiler/advisor_review/advisor_backend/common_func_advisor/__init__.py b/profiler/advisor_review/advisor_backend/common_func_advisor/__init__.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c
# --- /dev/null
# +++ b/profiler/advisor_review/advisor_backend/common_func_advisor/__init__.py
# @@ -0,0 +1,14 @@
# Copyright (c) 2023, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# diff --git a/profiler/advisor_review/advisor_backend/common_func_advisor/constant.py b/profiler/advisor_review/advisor_backend/common_func_advisor/constant.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..46a7fb24c2dade75c157f18118f29233eb924b88
# --- /dev/null
# +++ b/profiler/advisor_review/advisor_backend/common_func_advisor/constant.py
# @@ -0,0 +1,225 @@
# Copyright (c) 2023, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum


class CsvTitle:
    """Column titles shared by the old and new kernel-details csv layouts."""
    MODEL_NAME = "Model Name"
    MODEL_ID = "Model ID"
    TASK_ID = "Task ID"
    STREAM_ID = "Stream ID"
    INFER_ID = "Infer ID"
    TASK_START_TIME = "Task Start Time(us)"
    TASK_WAIT_TIME = "Task Wait Time(us)"
    BLOCK_DIM = "Block Dim"
    MIX_BLOCK_DIM = "Mix Block Dim"
    HF32_ELIGIBLE = "HF32 Eligible"
    INPUT_SHAPES = "Input Shapes"
    INPUT_DATA_TYPES = "Input Data Types"
    INPUT_FORMATS = "Input Formats"
    OUTPUT_SHAPES = "Output Shapes"
    OUTPUT_DATA_TYPES = "Output Data Types"
    OUTPUT_FORMATS = "Output Formats"
    CONTEXT_ID = "Context ID"
    AICORE_TIME = "aicore_time(us)"
    AIC_TOTAL_CYCLES = "aic_total_cycles"
    AIC_MAC_TIME = "aic_mac_time(us)"
    AIC_MAC_RATIO = "aic_mac_ratio"
    AIC_SCALAR_TIME = "aic_scalar_time(us)"
    AIC_SCALAR_RATIO = "aic_scalar_ratio"
    AIC_MTE1_TIME = "aic_mte1_time(us)"
    AIC_MTE1_RATIO = "aic_mte1_ratio"
    AIC_MTE2_TIME = "aic_mte2_time(us)"
    AIC_MTE2_RATIO = "aic_mte2_ratio"
    AIC_FIXPIPE_TIME = "aic_fixpipe_time(us)"
    AIC_FIXPIPE_RATIO = "aic_fixpipe_ratio"
    AIC_ICACHE_MISS_RATE = "aic_icache_miss_rate"
    AIV_TIME = "aiv_time(us)"
    AIV_TOTAL_CYCLES = "aiv_total_cycles"
    AIV_VEC_TIME = "aiv_vec_time(us)"
    AIV_VEC_RATIO = "aiv_vec_ratio"
    AIV_SCALAR_TIME = "aiv_scalar_time(us)"
    AIV_SCALAR_RATIO = "aiv_scalar_ratio"
    AIV_MTE2_TIME = "aiv_mte2_time(us)"
    AIV_MTE2_RATIO = "aiv_mte2_ratio"
    AIV_MTE3_TIME = "aiv_mte3_time(us)"
    AIV_MTE3_RATIO = "aiv_mte3_ratio"
    AIV_ICACHE_MISS_RATE = "aiv_icache_miss_rate"
    CUBE_UTILIZATION = "cube_utilization( %)"
    TASK_DURATION_SUM = "Task Duration Sum(us)"
    TASK_DURATION_MEAN = "Task Duration Mean(us)"
    TASK_DURATION_STD = "Task Duration Std(us)"
    TASK_DURATION_RATIO = "Task Duration Ratio(100%)"
    SIZE = "size(MB)"
    THROUGHPUT = "throughput(GB/s)"
    COLOR = "color"
    GAP = "Gap(us)"
    DURATION_SUM = "Duration Sum(us)"
    COUNT = "Count"
    MAX_DURATION = "Max Duration(us)"
    MIN_DURATION = "Min Duration(us)"
    AVG_DURATION = "Avg Duration(us)"
    DURATION_RATIO = "Duration Ratio"
    INDEX = "Index"


# CsvTitleV1 inherits CsvTitle and adapts to the legacy csv layout
class CsvTitleV1(CsvTitle):
    OP_NAME = "Op Name"
    OP_TYPE = "OP Type"
    TASK_TYPE = "Task Type"
    TASK_DURATION = "Task Duration(us)"


# CsvTitleV2 inherits CsvTitle and adapts to the new csv layout
# (the original comment mislabelled this class as V1 — fixed)
class CsvTitleV2(CsvTitle):
    OP_NAME = "Name"
    OP_TYPE = "Type"
    TASK_TYPE = "Accelerator Core"
    TASK_DURATION = "Duration(us)"


class Constant:
    """Shared constants for the advisor backend (dtype sizes, keys, patterns)."""
    DTYPE_SIZE_MAP = {"int8": 1, "uint8": 1,
                      "int16": 2, "uint16": 2,
                      "int32": 4, "uint32": 4,
                      "int64": 8, "uint64": 8,
                      "float16": 2,
                      "bfloat16": 2,
                      "bf16": 2,
                      "dt_bf16": 2,
                      "float32": 4,
                      "float": 4,
                      "float64": 8,
                      "complex64": 8,
                      "complex128": 16,
                      "bool": 1}
    TP_THRESHOLD = 1150
    MAX_INPUT_MODE_LEN = 30
    MAX_INPUT_ADVICE_LEN = 30
    SMALL_OP_DUR_RATIO = 0.2
    SMALL_OP_NUM_RATIO = 0.2
    BYTE_UNIT_TRANS = 1024
    UNIT_TRANS = 1000

    # mode list
    COMPUTE = "compute"
    TIMELINE = "timeline"
    CLUSTER = "cluster"
    OVERALL = "overall"
    PIPELINE = "pipeline"

    # advice list
    SLOW_RANK = "slow rank"
    SLOW_LINK = "slow link"
    KERNEL = "kernel"

    # compute
    NPU_FUSED = "npu_fused"
    NPU_SLOW = "npu_slow"

    # timeline
    OPTIM = "optimizer"
    OP_SCHE = "op_schedule"

    # overall
    SUMMARY = "summary"

    PT_PROF_SUFFIX = "ascend_pt"
    ASCEND_PROFILER_OUTPUT = "ASCEND_PROFILER_OUTPUT"
    COLLECTION_PATH = "collection_path"
    CLUSTER_ANALYSIS_OUTPUT = "cluster_analysis_output"
    KERNEL_DETAILS_CSV = "kernel_details.csv"
    CLUSTER_STEP_TIME_CSV = "cluster_step_trace_time.csv"
    CLUSTER_COMM_JSON = "cluster_communication.json"

    # pipeline (typo "pipline" fixed)
    OP_NAME = "name"
    OP_TID = "tid"
    PID = "pid"
    TS = "ts"
    DUR = "dur"
    CAT = "cat"
    ARGS = "args"
    PH = "ph"
    ID = "id"
    PH_START = "s"
    PH_BEGIN = "B"
    PH_END = "E"
    PH_META = "M"
    PH_X = "X"
    CNAME = "cname"
    PROCESS_NAME = "process_name"
    FRAMEWORK_NAME = "Python"
    ASCEND_HARDWARE_NAME = "Ascend Hardware"
    ASYNC_NPU = "async_npu"
    STEP_PREFIX = "ProfilerStep#"
    FP_ATEN_OP = "aten"
    FP_C10D_OP = "c10d"
    HCOM_OP_PREFIX = "hcom_"
    BP_AUTOGRAD_OP = "autograd"
    TRACE_VIEW_JSON = "trace_view.json"

    # pattern_dict key: pattern, value: pattern name
    PATTERN_DICT = {("Add", "DropOutDoMask", "Add"): "bias_dropout_add",
                    ("BatchMatMul", "Mul", "Cast", "Mul", "MaskedFill", "SoftmaxV2", "Cast", "DropOutDoMask",
                     "AsStrided", "BatchMatMul", "Transpose"): "FA",
                    ("Transpose", "Transpose", "Transpose", "Mul", "Transpose", "BatchMatMulV2", "MaskedFill",
                     "Cast", "SoftmaxV2", "Cast", "DropOutDoMask", "BatchMatMulV2", "Transpose"): "FA",
                    ("Transpose", "BatchMatMulV2", "Transpose", "Transpose", "BatchMatMulV2", "ZerosLike",
                     "DropOutDoMask", "Cast", "SoftmaxGrad", "Cast", "MaskedFill", "BatchMatMulV2",
                     "BatchMatMulV2", "Mul"): "FA",
                    ("Cast", "Square", "ReduceMeanD", "Add", "Rsqrt", "Cast", "Cast", "Mul", "Cast", "Cast",
                     "Mul", "Cast"): "RMSNORM",
                    ("Cast", "LayerNorm", "Cast"): "LayerNorm",
                    ("Add", "LayerNorm"): "AddLayerNorm",
                    ("Add", "LayerNormV3"): "AddLayerNorm",
                    ("Gelu", "Add"): "GeluAdd",
                    ("Cast", "Square", "MemSet", "ReduceMean", "Add", "Rsqrt", "Mul", "Cast", "Mul"): "RMSNorm",
                    ("BatchMatMul", "RealDiv", "Add", "Maximum", "SoftmaxV2", "Cast", "BatchMatMul"): "FA",
                    ("BatchMatMulV2", "RealDiv", "Add", "Cast", "Maximum", "Cast", "SoftmaxV2", "AsStrided",
                     "BatchMatMulV2"): "FA",
                    ("BatchMatMulV2", "RealDiv", "Add", "Cast", "SoftmaxV2", "Cast", "BroadcastTo",
                     "BatchMatMulV2"): "FA",
                    ("Mul", "Slice", "Neg", "Slice", "ConcatD", "Cast", "Mul", "Add"): "RotaryMul",
                    ("Mul", "AsStrided", "Neg", "AsStrided", "ConcatD", "Mul", "Add"): "RotaryMul",
                    ("Mul", "Slice", "Neg", "Slice", "ConcatD", "Mul", "Add"): "RotaryMul",
                    ("MatMulV2", "Swish", "MatMulV2", "Mul", "MatMulV2"): "FFN",
                    ("Transpose", "Transpose", "GatherElement", "Transpose"): "GatherElement",
                    ("Slice", "Slice", "Swish", "Mul"): "torch_npu.npu_swiglu",
                    ("Cast", "Mul", "MaskedFill", "SoftmaxV2", "Cast"): "torch_npu.npu_scaled_masked_softmax",
                    ("Mul", "Slice", "Neg", "Slice", "ConcatD", "Mul"): "torch_npu.npu_rotary_mul",
                    ("Cast", "Square", "ReduceMeanD", "Add", "Rsqrt", "Mul", "Cast", "Mul"): "torch_npu.npu_rms_norm"}
    TITLE = CsvTitleV2

    @classmethod
    def update_title(cls):
        """Switch the active column titles to the legacy (V1) csv layout."""
        cls.TITLE = CsvTitleV1


class CoreType:
    AIV = "AI_VECTOR_CORE"
    AIC = "AI_CORE"
    AICPU = "AI_CPU"
    MIX_AIV = "MIX_AIV"
    MIX_AIC = "MIX_AIC"
    HCCL = "HCCL"


class PerfColor(Enum):
    WHITE = 0
    GREEN = 1
    YELLOW = 2
    RED = 3


# ---- patch metadata preserved from the original (flattened) diff ----
# diff --git a/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_json.py b/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_json.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..8171f06ee235fc02da715044b4d310087c36c102
# --- /dev/null
# +++ b/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_json.py
# @@ -0,0 +1,209 @@
# Copyright (c) 2024, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from abc import abstractmethod
from dataclasses import dataclass
from dataclasses import field
from typing import Dict
from typing import List

import pandas as pd

from common_func.file_manager import FileManager


@dataclass
class TraceObj:
    """One chrome-trace event; subclasses define the key used to index it."""
    ph: str = ""
    bp: str = ""
    cat: str = ""
    name: str = ""
    pid: int = 0
    tid: int = 0
    id: int = 0
    ts: str = ""
    dur: float = 0.0
    # NOTE(review): the default is the string 'unknown', not a dict; every event parsed
    # below passes args explicitly, so the default is only hit for hand-built objects
    args: dict = field(default='unknown')

    @abstractmethod
    def hash(self):
        raise Exception("To be implemented")

    def valid(self):
        return self.name != ""

    def check_hashable(self):
        if not self.valid():
            # fixed: was self.__class__.name, which reads the dataclass field default ("")
            # instead of the class name
            raise Exception("Illegal {} to hash".format(self.__class__.__name__))


@dataclass
class Process(TraceObj):
    def hash(self):
        self.check_hashable()
        # msprof guarantees the process name is unique
        return self.args.get("name")


@dataclass
class Thread(TraceObj):
    def hash(self):
        self.check_hashable()
        # msprof guarantees the thread name is unique
        return self.args.get("name")


@dataclass
class DurationEvent(TraceObj):
    def hash(self):
        self.check_hashable()
        return self.ts


@dataclass
class FlowEvent(TraceObj):
    s_point_ts: str = ""
    e_point_ts: str = ""

    def hash(self):
        self.check_hashable()
        return self.e_point_ts


class TraceViewJson:
    """Indexed view over a trace_view.json: processes, threads, duration and flow events."""

    def __init__(self, path):
        self.processes: Dict[str, Process] = dict()
        self.threads: Dict[str, Thread] = dict()
        self.python_dur_events: Dict[str, DurationEvent] = dict()
        self.cann_dur_events: Dict[str, DurationEvent] = dict()
        self.ascend_hardware_dur_events: Dict[str, DurationEvent] = dict()
        self.torch_2_npu_flow_events: Dict[str, FlowEvent] = dict()
        traces = FileManager.read_json_file(path)
        self._load_obj(traces)

    def get_call_stack(self, data: pd.DataFrame, index_id: int, ts_col: str) -> str:
        """Resolve the python call stack for row `index_id` via its torch->npu flow event.

        Returns "" (after printing an error) when any lookup step fails.
        """
        if ts_col not in data.columns.tolist():
            print("[ERROR] No {} col found in data columns.".format(ts_col))
            return ""
        row = data.loc[index_id]
        timestamp = row[ts_col]
        flow_event = self.get_torch_2_npu_flow_event(timestamp)
        if not flow_event.valid():
            print("[ERROR] Get flow event failed for pattern {}.".format(row['pattern']))
            return ""
        flow_event_s_key = flow_event.s_point_ts
        python_dur_events = self.get_python_dur_events_contain_ts(flow_event_s_key)
        if not python_dur_events:
            print("[ERROR] No python dur event found for pattern {}.".format(row['pattern']))
            return ""
        # keep compatibility between old and new call-stack formats
        if python_dur_events[0].args.get("Call stack"):
            # old format: one ";"-joined string on the first event
            call_stack_list = python_dur_events[0].args.get("Call stack").split(";")
        else:
            python_dur_events.sort(key=lambda e: e.ts)
            # new format: one python_function event per stack frame
            call_stack_list = [event.name for event in python_dur_events if event.cat == "python_function"]
        call_stack = "\n".join(call_stack_list)
        return call_stack

    def get_torch_2_npu_flow_event(self, end_time) -> FlowEvent:
        """Return the flow event keyed by its end timestamp, or an empty FlowEvent."""
        if not self.torch_2_npu_flow_events or not self.torch_2_npu_flow_events.get(end_time):
            print("[ERROR] Find flow event failed for ts: {}".format(end_time))
            return FlowEvent()
        return self.torch_2_npu_flow_events.get(end_time)

    def get_python_dur_events_contain_ts(self, ts) -> List[DurationEvent]:
        """Return all python duration events whose [ts, ts+dur] interval contains `ts`."""
        return [event for event in self.python_dur_events.values()
                if float(event.ts) <= float(ts) <= float(event.ts) + event.dur]

    def _load_obj(self, traces):
        self._load_format(traces)
        if not self._check_format():
            print("[ERROR] parse json failed for error format")
            return
        self._load_duration_events(traces)
        self._load_torch_to_npu_flow_events(traces)

    def _check_format(self):
        # only these two processes are needed for now; extend as required
        check_processes = ['Python', 'Ascend Hardware']
        for check_process in check_processes:
            if check_process in self.processes:
                continue
            print("[ERROR] {} process not found in json.".format(check_process))
            return False
        return True

    # load the pid/tid metadata headers
    def _load_format(self, traces: List[Dict]):
        for trace in traces:
            if trace.get('name') == 'process_name':
                if not trace.get('args') or not trace.get('args').get('name') or not trace.get('pid'):
                    continue
                process = Process(**trace)
                self.processes[process.hash()] = process
            if trace.get('name') == 'thread_name':
                if not trace.get('args') or not trace.get('args').get('name') or not trace.get('tid'):
                    continue
                thread = Thread(**trace)
                self.threads[thread.hash()] = thread

    def _load_duration_events(self, traces: List[Dict]):
        def check_events(_trace):
            return _trace.get('name') and _trace.get("ts") and _trace.get("dur")

        python_pid = self.processes.get("Python").pid
        # fixed: CANN is not guaranteed by _check_format, so a missing CANN process
        # used to raise AttributeError here; use a sentinel instead
        cann_process = self.processes.get("CANN")
        cann_pid = cann_process.pid if cann_process else -1  # assumes real pids are non-negative — TODO confirm
        ascend_hardware_pid = self.processes.get("Ascend Hardware").pid
        for trace in traces:
            if trace.get('ph') != 'X':
                continue
            if not check_events(trace):
                continue
            event = DurationEvent(**trace)
            if trace.get('pid') == python_pid:
                self.python_dur_events[event.hash()] = event
            elif trace.get('pid') == cann_pid:
                self.cann_dur_events[event.hash()] = event
            elif trace.get("pid") == ascend_hardware_pid:
                self.ascend_hardware_dur_events[event.hash()] = event

    def _load_torch_to_npu_flow_events(self, traces: List[Dict]):
        def check_events(_trace):
            return _trace.get('name') and _trace.get("id") and _trace.get("ts")

        flow_events_table_by_id = dict()

        # NOTE(review): unlike _load_duration_events this keeps the Process object
        # (missing .pid), and with the `and` chain the pid clause never filters anything;
        # kept as-is so the set of collected events does not change — verify upstream
        python_pid = self.processes.get("Python")
        for trace in traces:
            if trace.get('ph') != 's' and trace.get('ph') != 'f' and trace.get('pid') != python_pid:
                continue
            if not check_events(trace):
                continue
            event = flow_events_table_by_id.get(trace.get("id"))
            if not event:
                event = FlowEvent(**trace)
            # 's' marks the flow start (framework side); anything else is treated as the end point
            if trace.get('ph') == 's':
                event.s_point_ts = trace.get('ts')
            else:
                event.e_point_ts = trace.get('ts')
            flow_events_table_by_id[event.id] = event

        self.torch_2_npu_flow_events = {eve.hash(): eve for eve in flow_events_table_by_id.values()}


# ---- patch metadata preserved from the original (flattened) diff; header continues past this chunk ----
# diff --git a/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_preprocessor.py
# ---- patch metadata preserved from the original (flattened) diff; header started in the previous chunk ----
# b/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_preprocessor.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..7b9baa32d9423a46bf93d563a6fabbbbb652aaf8
# --- /dev/null
# +++ b/profiler/advisor_review/advisor_backend/common_func_advisor/trace_view_preprocessor.py
# @@ -0,0 +1,208 @@
# Copyright (c) 2023, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import sys
from decimal import Decimal  # added: numeric comparison of timestamp strings
from typing import Optional
from dataclasses import dataclass

from common_func_advisor.constant import Constant


@dataclass
class FineTraceViewData:
    """Filtered view of a trace: fp/bp ops, hcom ops, npu op durations, flow links."""
    py_pid: int = -1
    fp_tid: int = -1
    bp_tid: int = -1
    ascend_pid: int = -1
    min_ts: str = str(sys.maxsize)
    max_ts: str = "0"
    hcom_tids: list = None
    fp_ops: list = None
    bp_ops: list = None
    hcom_ops: list = None
    npu_ops_ts_dur: dict = None
    torch_to_npu_links: list = None

    def __post_init__(self):
        # dataclasses cannot use mutable defaults, so fill them in here
        self.hcom_tids = self.hcom_tids or []
        self.fp_ops = self.fp_ops or []
        self.bp_ops = self.bp_ops or []
        self.hcom_ops = self.hcom_ops or []
        self.npu_ops_ts_dur = self.npu_ops_ts_dur or {}
        self.torch_to_npu_links = self.torch_to_npu_links or []

    def sort(self):
        """Sort every collected op list by timestamp."""
        self.fp_ops.sort(key=lambda x: x[Constant.TS])
        self.bp_ops.sort(key=lambda x: x[Constant.TS])
        self.hcom_ops.sort(key=lambda x: x[Constant.TS])
        self.torch_to_npu_links.sort(key=lambda x: x[Constant.TS])


class TraceViewPreProcessor:
    """
    Trace view data preprocess
    """

    @staticmethod
    def _is_fp_op(op_name: str) -> bool:
        """
        check whether op is fp op
        """
        return op_name.startswith(Constant.FP_ATEN_OP) or op_name.startswith(Constant.FP_C10D_OP)

    @staticmethod
    def _is_fp_data(data: dict, fp_tid: int, py_pid: int) -> bool:
        """
        check whether data is valid fp data
        """
        return data[Constant.OP_TID] == fp_tid and \
            Constant.TS in data and Constant.DUR in data and \
            not data[Constant.OP_NAME].startswith(Constant.STEP_PREFIX) and \
            data[Constant.PID] == py_pid

    @staticmethod
    def _is_bp_op(op_name: str) -> bool:
        """
        check whether op is bp op
        """
        return op_name.startswith(Constant.BP_AUTOGRAD_OP)

    @staticmethod
    def _is_bp_data(data: dict, bp_tid: int, py_pid: int) -> bool:
        """
        check whether data is valid bp data
        """
        return data[Constant.OP_TID] == bp_tid and \
            Constant.TS in data and Constant.DUR in data and \
            data[Constant.PID] == py_pid

    @staticmethod
    def _is_torch_to_npu_link(data: dict, fp_tid: int) -> bool:
        """
        check whether data is torch to npu link
        """
        # NOTE(review): this compares a *pid* field against the fp thread id;
        # looks like it should be the python pid — verify against callers
        return Constant.CAT in data and data[Constant.CAT] == Constant.ASYNC_NPU and \
            data[Constant.PH] == Constant.PH_START and \
            data[Constant.PID] == fp_tid

    @staticmethod
    def _is_send_recv_op(op_name: str) -> bool:
        """
        check whether op is hcom send or recv op
        """
        # eg: hcom_BatchSendRecv__101_0_1
        p1 = re.compile(r'hcom_\w+SendRecv__\d+')
        # eg: hcom_send__101_0_1
        p2 = re.compile(r'hcom_send__\d+')
        # eg: hcom_receive__101_0_1
        p3 = re.compile(r'hcom_receive__\d+')
        return bool(p1.match(op_name)) or bool(p2.match(op_name)) or bool(p3.match(op_name))

    @staticmethod
    def _is_hcom_op(op_name: str) -> bool:
        """
        check whether data is hcom data
        """
        return op_name.startswith(Constant.HCOM_OP_PREFIX)

    @staticmethod
    def _is_python_process(data: dict) -> bool:
        """
        check whether data is python process
        """
        return Constant.PH in data and data[Constant.PH] == Constant.PH_META and \
            data[Constant.OP_NAME] == Constant.PROCESS_NAME and \
            data[Constant.ARGS][Constant.OP_NAME] == Constant.FRAMEWORK_NAME

    @staticmethod
    def _is_step_op(data: dict) -> bool:
        """
        check whether data is step data
        """
        return data[Constant.OP_NAME].startswith(Constant.STEP_PREFIX)

    @staticmethod
    def _is_ascend_process(data: dict) -> bool:
        """
        check whether data is ascend process data
        """
        return Constant.PH in data and data[Constant.PH] == Constant.PH_META and \
            data[Constant.OP_NAME] == Constant.PROCESS_NAME and \
            data[Constant.ARGS][Constant.OP_NAME] == Constant.ASCEND_HARDWARE_NAME

    @staticmethod
    def _is_npu_op(data: dict, ascend_pid: int) -> bool:
        """
        check whether data is npu op
        """
        return Constant.PH in data and data[Constant.PH] == Constant.PH_X and \
            not data[Constant.OP_NAME].isupper() and \
            data[Constant.PID] == ascend_pid

    def process(self, raw_data: list) -> Optional[FineTraceViewData]:
        """
        preprocess raw data
        """
        if not raw_data:
            print("[ERROR] No raw data found in trace view data.")
            return None

        raw_fp_tids, raw_bp_tids, raw_hcom_tids = set(), set(), set()
        fine_data = FineTraceViewData()

        # counting fp ops and bp ops tid and ascend pid
        for data in raw_data:
            if self._is_fp_op(data[Constant.OP_NAME]):
                raw_fp_tids.add(data[Constant.OP_TID])
            elif self._is_bp_op(data[Constant.OP_NAME]):
                raw_bp_tids.add(data[Constant.OP_TID])
            elif self._is_send_recv_op(data[Constant.OP_NAME]):
                fine_data.hcom_ops.append(data)
                raw_hcom_tids.add(data[Constant.OP_TID])
            elif self._is_python_process(data):
                fine_data.py_pid = data[Constant.PID]
            elif self._is_ascend_process(data):
                fine_data.ascend_pid = data[Constant.PID]

            # find max and min ts in hcom ops
            if self._is_hcom_op(data[Constant.OP_NAME]):
                # for compatibility with old data (ts is float type)
                ts = data[Constant.TS] if not isinstance(data[Constant.TS], float) else str(data[Constant.TS])
                # fixed: plain min/max compared timestamp *strings* lexicographically,
                # which is wrong for numbers of different lengths; compare numerically
                fine_data.min_ts = min(fine_data.min_ts, ts, key=Decimal)
                fine_data.max_ts = max(fine_data.max_ts, ts, key=Decimal)

        unique_fp_tid = list(raw_fp_tids - raw_bp_tids)
        unique_bp_tid = list(raw_bp_tids)
        fine_data.hcom_tids = list(raw_hcom_tids)

        if not unique_fp_tid or not unique_bp_tid:
            print("[INFO] No fp or bp tid found in trace view data.")
        else:
            fine_data.fp_tid, fine_data.bp_tid = unique_fp_tid[0], unique_bp_tid[0]

        # filter fp ops and bp ops and torch_to_npu_links
        for data in raw_data:
            if self._is_fp_data(data, fine_data.fp_tid, fine_data.py_pid):
                fine_data.fp_ops.append(data)
            elif self._is_bp_data(data, fine_data.bp_tid, fine_data.py_pid):
                fine_data.bp_ops.append(data)
            elif self._is_torch_to_npu_link(data, fine_data.fp_tid):
                fine_data.torch_to_npu_links.append(data)
            elif self._is_npu_op(data, fine_data.ascend_pid):
                fine_data.npu_ops_ts_dur[data[Constant.TS]] = data[Constant.DUR]

        fine_data.sort()
        return fine_data


# ---- patch metadata preserved from the original (flattened) diff ----
# diff --git a/profiler/advisor_review/advisor_backend/compute_advice/__init__.py b/profiler/advisor_review/advisor_backend/compute_advice/__init__.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c
# --- /dev/null
# +++ b/profiler/advisor_review/advisor_backend/compute_advice/__init__.py
# @@ -0,0 +1,14 @@
# Copyright (c) 2023, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
diff --git a/profiler/advisor_review/advisor_backend/compute_advice/compute_advice_base.py b/profiler/advisor_review/advisor_backend/compute_advice/compute_advice_base.py new file mode 100644 index 0000000000000000000000000000000000000000..cafbafd8e28c162bc76edb2f77ebd0645fed552f --- /dev/null +++ b/profiler/advisor_review/advisor_backend/compute_advice/compute_advice_base.py @@ -0,0 +1,105 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from abc import abstractmethod
from collections import defaultdict
import os

from advice_base import AdviceBase
from common_func.file_manager import FileManager


class ComputeAdviceBase(AdviceBase):
    """Base class for computation advisers that analyse kernel_details.csv.

    Resolves the input path (ascend_pt directory or a bare csv), caches
    whether the profiling run captured python call stacks, and defines the
    run/output contract for subclasses.
    """
    ASCEND_PT = 'ascend_pt'
    ASCEND_PROFILER_OUTPUT = 'ASCEND_PROFILER_OUTPUT'
    KERNEL_DETAIL_FILE = "kernel_details.csv"
    TRACE_VIEW_FILE = "trace_view.json"

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.kernel_details_path = ""       # resolved by path_check()
        self.has_preparse = False
        self.preparse_data = defaultdict(list)
        self.call_stack = None              # tri-state cache: None = not checked yet
        self.trace_view_path = ""           # resolved by has_callstack()

    def path_check(self):
        """
        check whether input path is valid

        Accepts either an ``*_ascend_pt`` directory (the csv is looked up
        inside its ASCEND_PROFILER_OUTPUT subdir) or a direct path to
        kernel_details.csv. Returns True and runs preparse() on success.
        """
        if not os.path.exists(self.collection_path):
            print("[ERROR] Path: {} is not exist.".format(self.collection_path))
            return False
        # use the class constants instead of re-spelling the string literals
        if os.path.isdir(self.collection_path) and self.collection_path.endswith(self.ASCEND_PT):
            self.kernel_details_path = os.path.join(self.collection_path, self.ASCEND_PROFILER_OUTPUT,
                                                    self.KERNEL_DETAIL_FILE)
            if not os.path.exists(self.kernel_details_path):
                print("[ERROR] kernel_details.csv is not exist in the Path: {}.".format(
                    os.path.join(self.collection_path, self.ASCEND_PROFILER_OUTPUT)))
                return False
        elif os.path.isfile(self.collection_path) and \
                os.path.basename(self.collection_path) == self.KERNEL_DETAIL_FILE:
            self.kernel_details_path = self.collection_path
        else:
            print("[ERROR] Please input ascend_pt or kernel_details.csv")
            return False
        print("[INFO] Start to analyse the target file: {}".format(self.kernel_details_path))
        self.preparse()
        return True

    def has_callstack(self):
        """Return whether the run captured python call stacks (cached).

        True only when a profiler_info*.json exists, trace_view.json exists,
        ``with_stack`` is enabled and CPU activity was profiled.
        """
        if self.call_stack is not None:
            return self.call_stack
        profiler_info_json_path = ""
        for file in os.listdir(self.collection_path):
            if file.startswith("profiler_info"):
                profiler_info_json_path = os.path.join(self.collection_path, file)
                break
        if not profiler_info_json_path:
            self.call_stack = False
            return self.call_stack
        self.trace_view_path = os.path.join(self.collection_path, self.ASCEND_PROFILER_OUTPUT,
                                            self.TRACE_VIEW_FILE)
        if not os.path.exists(profiler_info_json_path) or not os.path.exists(self.trace_view_path):
            self.call_stack = False
            return self.call_stack
        info = FileManager.read_json_file(profiler_info_json_path)
        # `or {}` also covers keys present with a null value
        common_config = (info.get("config") or {}).get("common_config") or {}
        if not common_config.get("with_stack"):
            self.call_stack = False
            return self.call_stack
        activities = common_config.get("activities")
        if not activities or "ProfilerActivity.CPU" not in activities:
            self.call_stack = False
            return self.call_stack
        self.call_stack = common_config.get("with_stack")
        return self.call_stack

    @abstractmethod
    def run(self):
        """
        analyze profiling data and advice
        """

    @abstractmethod
    def output(self):
        """
        output relevant data

        Default body fills output_format_data; subclasses are expected to
        call it via super() after populating cur_data/cur_bottleneck/cur_advice.
        """
        # DATA/BOTTLENECK/ADVICE presumably come from AdviceBase — confirm
        self.output_format_data[self.DATA] = self.cur_data
        self.output_format_data[self.BOTTLENECK] = self.cur_bottleneck
        self.output_format_data[self.ADVICE] = self.cur_advice

    def preparse(self):
        """Hook run once by path_check(); has_preparse guards re-entry."""
        if self.has_preparse:
            return
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/csv_analyzer.py b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/csv_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..c85c14d618ceda199c9c376abc27a3581eed97b8 --- /dev/null +++ b/profiler/advisor_review/advisor_backend/compute_advice/npu_fused/csv_analyzer.py @@ -0,0 +1,81 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import multiprocessing

import pandas as pd
import numpy as np

from common_func_advisor.constant import Constant
from .op_perf import OpPerfFactory


class CSVAnalyzer:
    """Scans kernel_details.csv for operator sequences that could be fused."""

    def __init__(self, path) -> None:
        self._path = path

    def process(self):
        """Return one summary row per known fusable pattern."""
        frame = pd.read_csv(self._path, dtype={"Start Time(us)": str})
        # look for fusable operator sequences
        op_types = frame["Type"].tolist()
        durations = frame["Duration(us)"].tolist()
        # drop the trailing \t separator from each timestamp
        timestamps = [stamp[:-1] for stamp in frame["Start Time(us)"].tolist()]
        rows = []
        for pattern in Constant.PATTERN_DICT.keys():
            rows.extend(self.find_all_sub_lists(op_types, durations, timestamps, pattern))
        result = pd.DataFrame(rows)
        result.columns = ["pattern_name", "pattern", "len", "count", "duration sum(us)",
                          "op durations(us)", "index", "first_timestamp"]
        return result

    @staticmethod
    def find_all_sub_lists(op_type_list, duration_list, start_times, expect_sub_list):
        """Locate every occurrence of ``expect_sub_list`` in ``op_type_list``.

        Returns a single summary row: [pattern_name, pattern, window length,
        count, total duration, per-position duration sums (rounded), match
        indices, first match timestamp]. A zero-filled row is returned when
        the pattern never occurs.
        """
        window = len(expect_sub_list)
        target = tuple(expect_sub_list)
        count = 0
        match_indices = []
        total_duration = 0
        per_op_durations = []
        first_time = None
        # slide a window of len(target) over the op sequence
        for start in range(len(op_type_list) - window + 1):
            if tuple(op_type_list[start:start + window]) != target:
                continue
            window_durations = duration_list[start:start + window]
            if count == 0:
                first_time = start_times[start]
                per_op_durations = list(window_durations)
            else:
                per_op_durations = [acc + cur for acc, cur in zip(per_op_durations, window_durations)]
            count += 1
            match_indices.append(start)
            total_duration += sum(window_durations)
        pattern_name = Constant.PATTERN_DICT.get(target, "unknown")
        if count == 0:
            return [[pattern_name, target, 0, 0, 0, 0, 0, 0]]
        rounded_durations = [round(value, 2) for value in per_op_durations]
        return [[pattern_name, target, window, count, total_duration,
                 rounded_durations, match_indices, first_time]]
import pandas as pd

from common_func_advisor.trace_view_json import TraceViewJson


class JSONAnalyzer(object):
    """Resolves the python call stack behind each npu-op timestamp."""

    def __init__(self, path):
        self._path = path

    def get_custom_code(self, data: pd.DataFrame, ts_col: str, output_col: str):
        """
        Return a one-column DataFrame (``output_col``) holding, for each row
        of ``data``, the python call stack reached through the torch->npu
        flow event at ``row[ts_col]``; empty string when resolution fails.
        """
        trace_json = TraceViewJson(self._path)
        callstacks = pd.DataFrame(columns=[output_col])

        # validate the column once up front instead of re-checking per row
        if ts_col not in data.columns.tolist():
            print("[ERROR] No {} col found in data columns.".format(ts_col))
            return callstacks
        for i, row in data.iterrows():
            timestamp = row[ts_col]
            flow_event = trace_json.get_torch_2_npu_flow_event(timestamp)
            if not flow_event.valid():
                print("[ERROR] Get flow event failed for pattern {}.".format(row['pattern']))
                callstacks.loc[i] = ""
                continue
            flow_event_s_key = flow_event.s_point_ts
            python_dur_events = trace_json.get_python_dur_events_contain_ts(flow_event_s_key)
            if not python_dur_events:
                print("[ERROR] No python dur event found for pattern {}.".format(row['pattern']))
                callstacks.loc[i] = ""
                continue
            # keep compatibility between old and new callstack formats
            if python_dur_events[0].args.get("Call stack"):
                # old format: one semicolon-separated string
                callstack = python_dur_events[0].args.get("Call stack").split(";")
            else:
                python_dur_events.sort(key=lambda e: e.ts)
                # new format: one python_function event per stack frame
                callstack = [event.name for event in python_dur_events if event.cat == "python_function"]
            callstack_str = "\n".join(callstack)
            callstacks.loc[i] = callstack_str
        return callstacks
import functools
from typing import Dict

from common_func_advisor.constant import Constant
from common_func_advisor.constant import CoreType
from common_func_advisor.constant import PerfColor


class OpPerfFactory:
    """Builds the OpPerf wrapper matching the op's accelerator core type."""

    @classmethod
    def build(cls, op_row: Dict):
        """Return a Vec/Cube/plain OpPerf for one kernel_details row."""
        task_type = op_row.get(Constant.TITLE.TASK_TYPE)
        if task_type == CoreType.AIV:
            return VecOpPerf(op_row)
        if task_type == CoreType.AIC:
            return CubeOpPerf(op_row)
        return OpPerf(op_row)


class OpPerf:
    """One kernel_details.csv row plus derived size/throughput/color metrics."""

    def __init__(self, op_row: Dict):
        # legacy csv header variant — switch the title constants accordingly
        if "OP Type" in op_row.keys():
            Constant.update_title()
        self.row = op_row
        # identification
        self.model_name = op_row.get("Model Name")
        self.model_id = op_row.get("Model ID")
        self.task_id = op_row.get("Task ID")
        self.stream_id = op_row.get("Stream ID")
        self.infer_id = op_row.get("Infer ID")
        self.op_name = op_row.get("Name")
        self.op_type = op_row.get("Type")
        self.task_type = op_row.get("Accelerator Core")
        # timing
        self.task_start_time = op_row.get("Start Time(us)")
        self.task_duration = op_row.get("Duration(us)")
        self.task_wait_time = op_row.get("Wait Time(us)")
        self.block_dim = op_row.get("Block Dim")
        self.mix_block_dim = op_row.get("Mix Block Dim")
        # tensor metadata
        self.hf32_eligible = op_row.get("HF32 Eligible")
        self.input_shapes = op_row.get("Input Shapes")
        self.input_data_types = op_row.get("Input Data Types")
        self.input_formats = op_row.get("Input Formats")
        self.output_shapes = op_row.get("Output Shapes")
        self.output_data_types = op_row.get("Output Data Types")
        self.output_formats = op_row.get("Output Formats")
        self.context_id = op_row.get("Context ID")
        # aicore (cube) counters
        self.aicore_time = op_row.get("aicore_time(us)")
        self.aic_total_cycles = op_row.get("aic_total_cycles")
        self.aic_mac_time = op_row.get("aic_mac_time(us)")
        self.aic_mac_ratio = op_row.get("aic_mac_ratio")
        self.aic_scalar_time = op_row.get("aic_scalar_time(us)")
        self.aic_scalar_ratio = op_row.get("aic_scalar_ratio")
        self.aic_mte1_time = op_row.get("aic_mte1_time(us)")
        self.aic_mte1_ratio = op_row.get("aic_mte1_ratio")
        self.aic_mte2_time = op_row.get("aic_mte2_time(us)")
        self.aic_mte2_ratio = op_row.get("aic_mte2_ratio")
        self.aic_fixpipe_time = op_row.get("aic_fixpipe_time(us)")
        self.aic_fixpipe_ratio = op_row.get("aic_fixpipe_ratio")
        self.aic_icache_miss_rate = op_row.get("aic_icache_miss_rate")
        # aiv (vector) counters
        self.aiv_time = op_row.get("aiv_time(us)")
        self.aiv_total_cycles = op_row.get("aiv_total_cycles")
        self.aiv_vec_time = op_row.get("aiv_vec_time(us)")
        self.aiv_vec_ratio = op_row.get("aiv_vec_ratio")
        self.aiv_scalar_time = op_row.get("aiv_scalar_time(us)")
        self.aiv_scalar_ratio = op_row.get("aiv_scalar_ratio")
        self.aiv_mte2_time = op_row.get("aiv_mte2_time(us)")
        self.aiv_mte2_ratio = op_row.get("aiv_mte2_ratio")
        self.aiv_mte3_time = op_row.get("aiv_mte3_time(us)")
        self.aiv_mte3_ratio = op_row.get("aiv_mte3_ratio")
        self.aiv_icache_miss_rate = op_row.get("aiv_icache_miss_rate")
        self.cube_utilization = op_row.get("cube_utilization( %)")

    @staticmethod
    def get_dtype_size(dtype_str: str):
        """Byte size of one element of ``dtype_str``; 0 when unknown."""
        return Constant.DTYPE_SIZE_MAP.get(dtype_str.lower(), 0)

    @staticmethod
    def get_element_count(shape: list):
        """Product of all dims of a non-empty shape tuple/list."""
        return functools.reduce(lambda x, y: int(x) * int(y), shape)

    @staticmethod
    def shape_to_tuple(shape_str: str) -> tuple:
        """Parse '"a,b;c,d"' into ((a, b), (c, d)); empty dims become 0.

        Fixed to return an empty *tuple* (was an empty list) on invalid or
        empty input, matching the declared return type; both are falsy and
        len 0, so callers are unaffected.
        """
        if not isinstance(shape_str, str):
            return ()
        shape_str = shape_str.strip('"')
        split_shape = shape_str.strip(';')
        if not split_shape:
            return ()
        pairs = split_shape.split(';')
        shape_result = []
        for pair in pairs:
            pair = pair.strip(";")
            elements = pair.split(',')
            # NOTE(review): non-numeric dims would raise ValueError here —
            # assumes the csv only contains integer dims
            elements = tuple(int(element) if "" != element else 0 for element in elements)
            shape_result.append(elements)
        return tuple(shape_result)

    @staticmethod
    def dtype_to_tuple(dtypes_str: str) -> tuple:
        """Parse '"FLOAT16;INT32"' into ('FLOAT16', 'INT32').

        Fixed to return an empty tuple (was an empty list) on invalid input.
        """
        if not isinstance(dtypes_str, str):
            return ()
        dtypes_str = dtypes_str.strip('"')
        split_dtypes = dtypes_str.strip(';')
        if not split_dtypes:
            return ()
        return tuple(split_dtypes.split(';'))

    def get_mac_ratio(self):
        """Cube MAC utilization ratio from the csv row."""
        return self.aic_mac_ratio

    def get_size(self, shapes_str, dtypes_str):
        """Total bytes across all tensors described by the shape/dtype strings."""
        shapes = self.shape_to_tuple(shapes_str)
        dtypes = self.dtype_to_tuple(dtypes_str)
        if len(shapes) > len(dtypes):
            print(f"[ERROR] The size of shape is greater than that of dtypes.")
            return 0
        if len(shapes) < len(dtypes):
            # pad with scalar shapes so zip-by-index lines up
            shapes = list(shapes)
            shapes.extend([(1,)] * (len(dtypes) - len(shapes)))
        all_size = 0
        for index, shape in enumerate(shapes):
            element_count = self.get_element_count(shape)
            dtype_size = self.get_dtype_size(dtypes[index])
            all_size += element_count * dtype_size
        return all_size

    def get_calc_size(self):
        # input and output bytes (MB)
        if not self.input_shapes or not self.output_shapes:
            print("[ERROR] There is no tensor data, do not assess vector op performance.")
            return 0
        intput_size = self.get_size(self.input_shapes, self.input_data_types)
        output_size = self.get_size(self.output_shapes, self.output_data_types)
        return (intput_size + output_size) / (Constant.BYTE_UNIT_TRANS * Constant.BYTE_UNIT_TRANS)

    def get_throughput(self):
        # throughput(GB/s)
        if not self.task_duration or abs(self.task_duration) < 1e-6:
            print("[ERROR] There is no task_duration, do not assess vector op performance.")
            return 0
        return self.row[Constant.TITLE.SIZE] / Constant.BYTE_UNIT_TRANS / self.task_duration * Constant.UNIT_TRANS * Constant.UNIT_TRANS

    def get_perf_color(self):
        """Default grade; subclasses override with core-specific thresholds."""
        return PerfColor.WHITE

    def update(self):
        """Compute size/throughput/color and write them back into the row."""
        self.row[Constant.TITLE.SIZE] = self.get_calc_size()
        self.row[Constant.TITLE.THROUGHPUT] = self.get_throughput()
        self.row[Constant.TITLE.COLOR] = self.get_perf_color().name
        return self.row


class VecOpPerf(OpPerf):
    """Vector (AIV) op: graded by memory throughput."""

    def get_perf_color(self) -> PerfColor:
        throughput = self.row[Constant.TITLE.THROUGHPUT]
        op_duration = self.task_duration
        tp_threshold = Constant.TP_THRESHOLD
        if throughput == 0:
            return PerfColor.WHITE
        if throughput < tp_threshold / 2 and op_duration > 20:
            return PerfColor.RED
        elif tp_threshold / 2 <= throughput < tp_threshold:
            return PerfColor.YELLOW
        else:
            return PerfColor.GREEN


class CubeOpPerf(OpPerf):
    """Cube (AIC) op: graded by MAC utilization ratio."""

    def get_perf_color(self) -> PerfColor:
        aic_mac_ratio = self.get_mac_ratio()
        if not aic_mac_ratio:
            print("[WARNING] There is no aic_mac_ratio, do not assess cube op performance.")
            return PerfColor.WHITE
        elif aic_mac_ratio < 0.6:
            return PerfColor.RED
        elif 0.6 <= aic_mac_ratio < 0.8:
            return PerfColor.YELLOW
        else:
            return PerfColor.GREEN
import os
from abc import ABC

import pandas as pd

from compute_advice.compute_advice_base import ComputeAdviceBase
from compute_advice.npu_fused.csv_analyzer import CSVAnalyzer
from compute_advice.npu_fused.json_analyzer import JSONAnalyzer


class NpuFusedAdvice(ComputeAdviceBase, ABC):
    """Finds fusable operator patterns and formats replacement advice."""

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.cur_data = dict()
        self.cur_bottleneck = str()
        self.cur_advice = str()
        self.kernel_details_path = ""
        self.call_stack = None

    def run(self):
        """Validate the input path, analyse, and return the formatted result."""
        if not self.path_check():
            return self.output_format_data
        self.process()
        self.output()
        return self.output_format_data

    def process(self):
        """Collect fusable patterns from the csv and build the advice text."""
        csv_analyzer = CSVAnalyzer(self.kernel_details_path)
        all_pattern_data = csv_analyzer.process()
        all_pattern_data = all_pattern_data.sort_values(by='duration sum(us)', ascending=False)
        # boolean-index directly; the original DataFrame.get(...get(...) > 0)
        # chain silently yields None when the column is missing
        filter_data = all_pattern_data[all_pattern_data["duration sum(us)"] > 0]
        if not self.has_callstack():
            print("[Warning] No call stack info found, advice will be incomplete")
            self.cur_data = filter_data
        else:
            json_analyzer = JSONAnalyzer(self.trace_view_path)
            custom_code = json_analyzer.get_custom_code(filter_data, "first_timestamp", "custom code")
            self.cur_data = pd.concat([filter_data, custom_code], axis=1)
        op_num = len(self.cur_data.index)
        op_dur = filter_data["duration sum(us)"].sum()
        if op_num <= 0:
            return
        # NOTE(review): op_dur is summed from a "(us)" column but reported as
        # "ms" — confirm which unit is intended before changing the message
        self.cur_bottleneck = f"The computing time of fusable op is {round(op_dur, 2)} ms."
        self.cur_advice = ""
        for index, (_, row) in enumerate(self.cur_data.iterrows()):
            advice = f"Advice {index}:\n"
            cur_op = "[" + ", ".join(row.loc["pattern"]) + "]"
            npu_fused_op = row.loc["pattern_name"]
            advice += f"Replace {cur_op} with {npu_fused_op}. "
            if self.call_stack:
                advice += f"This pattern first happened in: \n{row['custom code']}"
            if index != op_num - 1:
                advice += "\n"
            self.cur_advice += advice
from abc import ABC
import multiprocessing

import pandas as pd

from compute_advice.compute_advice_base import ComputeAdviceBase
from compute_advice.npu_fused.op_perf import OpPerfFactory
from common_func_advisor.constant import Constant
from common_func_advisor.constant import PerfColor
from advisor_backend.common_func_advisor.trace_view_json import TraceViewJson


class NpuSlowAdvice(ComputeAdviceBase, ABC):
    """Grades each kernel row (red/yellow/green) and exports a colored sheet."""

    OP_PERF_SHEET = "op_perf"

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.kernel_details_path = ""
        self.data = pd.DataFrame()

    @staticmethod
    def save_to_excel(data: pd.DataFrame, file_path: str) -> None:
        """Write the graded rows to xlsx with one background fill per row."""
        writer = pd.ExcelWriter(file_path, engine="xlsxwriter", mode="w")
        data.index.name = Constant.TITLE.INDEX
        data.to_excel(writer, index=True, sheet_name=NpuSlowAdvice.OP_PERF_SHEET)
        NpuSlowAdvice.color_sheet(data, writer.book, writer.sheets[NpuSlowAdvice.OP_PERF_SHEET])
        writer.sheets[NpuSlowAdvice.OP_PERF_SHEET].freeze_panes = "A2"
        writer.close()

    @staticmethod
    def color_sheet(data: pd.DataFrame, workbook, worksheet):
        """Apply the per-row fill derived from the COLOR column."""
        color_rgb = {
            PerfColor.GREEN.name: workbook.add_format({'bg_color': '#C6EFCE'}),
            PerfColor.YELLOW.name: workbook.add_format({'bg_color': '#FFEB9C'}),
            PerfColor.RED.name: workbook.add_format({'bg_color': '#FFC7CE'}),
        }
        for row_index, row in data.iterrows():
            fill_format = color_rgb.get(row[Constant.TITLE.COLOR])
            if not fill_format:
                continue
            # +1 skips the header row in the worksheet
            worksheet.set_row(row_index + 1, None, fill_format)

    @staticmethod
    def update_op_row(row: tuple):
        """Enrich one (index, Series) pair produced by DataFrame.iterrows()."""
        return OpPerfFactory.build(row[1]).update()

    def get_call_stack(self, data: pd.DataFrame, index_id: int, ts_col: str) -> str:
        """Resolve the python call stack for one row; "" when unavailable."""
        if not self.has_callstack():
            print("There is no call stack info, please set 'with_stack=True'")
            return ""
        trace_json = TraceViewJson(self.trace_view_path)
        return trace_json.get_call_stack(data, index_id, ts_col)

    def run(self):
        """Validate the path, grade the rows, and return the DataFrame."""
        if not self.path_check():
            return self.data
        self.process()
        return self.data

    def process(self):
        """Read kernel_details.csv and grade every row in parallel."""
        self.data = pd.read_csv(self.kernel_details_path, dtype={"Start Time(us)": str})
        # strip the trailing \t separator
        self.data["Start Time(us)"] = self.data["Start Time(us)"].apply(lambda x: x[:-1])
        # context manager guarantees worker cleanup even if map() raises;
        # the original close()d the pool without join and leaked on error
        with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
            result = pool.map(self.update_op_row, self.data.iterrows())
        self.data = pd.DataFrame(result)
import os
import sys

sys.path.append(
    os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "advisor_backend"))
sys.path.append(
    os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "compare_tools"))
sys.path.append(
    os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "cluster_analyse"))
from common_func_advisor.constant import Constant
from advisor_backend.advice_factory.cluster_advice_factory import ClusterAdviceFactory
from advisor_backend.advice_factory.compute_advice_factory import ComputeAdviceFactory
from advisor_backend.advice_factory.timeline_advice_factory import TimelineAdviceFactory
from advisor_backend.advice_factory.overall_advice_factory import OverallAdviceFactory


class Interface:
    """Facade that routes (mode, advice) requests to the matching factory."""

    def __init__(self, collection_path: str):
        self.collection_path = os.path.realpath(collection_path)
        self._factory_controller = FactoryController(collection_path)

    def get_data(self: any, mode: str, advice: str, **kwargs):
        """Produce advice for ``mode``; rejects oversized mode/advice strings."""
        if len(mode) > Constant.MAX_INPUT_MODE_LEN or len(advice) > Constant.MAX_INPUT_ADVICE_LEN:
            msg = '[ERROR]Input Mode is illegal.'
            raise RuntimeError(msg)
        factory = self._factory_controller.create_advice_factory(mode, kwargs.get("input_path", ""))
        return factory.produce_advice(advice, kwargs)


class FactoryController:
    """Maps a mode string to its advice-factory class."""

    FACTORY_LIB = {
        Constant.CLUSTER: ClusterAdviceFactory,
        Constant.COMPUTE: ComputeAdviceFactory,
        Constant.TIMELINE: TimelineAdviceFactory,
        Constant.OVERALL: OverallAdviceFactory
    }

    def __init__(self, collection_path: str):
        self.collection_path = os.path.realpath(collection_path)
        self.temp_input_path = None

    def create_advice_factory(self, mode: str, input_path: str):
        """Instantiate the factory for ``mode``; fail loudly on unknown modes."""
        collection_path = input_path if input_path else self.collection_path
        factory_class = self.FACTORY_LIB.get(mode)
        if factory_class is None:
            # was: unconditional call -> opaque "'NoneType' is not callable"
            raise RuntimeError('[ERROR]Input Mode is illegal.')
        return factory_class(collection_path)


if __name__ == "__main__":
    # was: Interface() with no argument, which always raised TypeError;
    # take the collection path from argv, defaulting to the working directory
    Interface(sys.argv[1] if len(sys.argv) > 1 else os.getcwd())
class OverallSummaryAdvice(AdviceBase):
    """Produces an overall performance summary and bottleneck advice.

    Compares the target profiling data against an optional baseline
    (``base_collection_path``) via ``ComparisonInterface`` and derives
    per-category durations, bottleneck descriptions and follow-up advice.
    """

    advice_map = {
        "Computing Time": "if you want more detailed advice please use msprof-analyze advisor computation.",
        "Uncovered Communication Time": "if you want more detailed advice, please use msprof-analyze advisor schedule.",
        "Free Time": "if you want more detailed advice please use msprof-analyze advisor schedule."
    }
    time_name_map = {
        "Computing Time": "computing",
        "Uncovered Communication Time": "communication",
        "Free Time": "free",
        'Cube Time(Num)': 'Cube Time',
        'Vector Time(Num)': 'Vector Time',
        'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)',
        'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)',
        'Other Time': "Other Computing Time",
        'SDMA Time(Num)': 'SDMA Time'
    }
    # Top-level categories and the sub-columns that break each one down.
    performance_time_dict = {
        "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)',
                           'Flash Attention Time(Backward)(Num)', 'Other Time'],
        "Uncovered Communication Time(Wait Time)": [],
        "Free Time": ['SDMA Time(Num)']
    }

    def __init__(self, collection_path: str, kwargs: dict):
        super().__init__(collection_path)
        self.base_collection_path = kwargs.get("base_collection_path", "")
        self._has_base_collection = False
        self._is_minimal_profiling = False
        self.cur_data = {}
        self.cur_bottleneck = {}
        self.cur_advices = ""
        self._headers = []
        self._base_data = []
        self._comparison_data = []

    @staticmethod
    def split_duration_and_num(time_value: str) -> tuple:
        """Split a "0.229s(1756)"-style cell into (duration_sec, kernel_num).

        ``num`` is ``None`` when no kernel count is present; ``duration``
        falls back to 0.0 on parse failure (with a warning).
        """
        split_data = time_value.split("s")  # time value example: 0.229s(1756)
        duration, num = 0.0, None
        if len(split_data) >= 2:
            try:
                num = int(split_data[1].strip("()"))
            except ValueError:
                pass
        if len(split_data) >= 1:
            try:
                duration = float(split_data[0])
            except ValueError:
                print(f"[WARNING] Invalid time value: {time_value}.")
        return duration, num

    @staticmethod
    def calculate_ratio(dividend, divisor):
        """Return dividend/divisor, or +inf when the divisor is falsy (0/None)."""
        if not divisor:
            return float("inf")
        return dividend / divisor

    def run(self):
        """Entry point: validate paths, compute data, and return formatted output."""
        if self.path_check():
            self.process()
            self.output()
            self.identify_bottleneck()
        return self.output_format_data

    def path_check(self):
        """Validate target path; note (but tolerate) a missing baseline path."""
        if self.base_collection_path:
            if os.path.exists(self.base_collection_path):
                self._has_base_collection = True
            else:
                print(f"[WARNING] Invalid path which not exists: {self.base_collection_path}.")
        return os.path.exists(self.collection_path)

    def process(self):
        """Run the overall comparison and aggregate per-category durations."""
        # Without a baseline, compare the collection against itself to reuse
        # the comparison pipeline for pure summary extraction.
        base_collection_path = self.base_collection_path if self._has_base_collection else self.collection_path
        result_data = ComparisonInterface(base_collection_path, self.collection_path).compare(Constant.OVERALL_COMPARE)
        for data in result_data.values():
            self._headers = data.get("headers", [])
            rows = data.get("rows", [])
            if len(rows) == 2:
                self._base_data = rows[0]
                self._comparison_data = rows[1]
        if not self._headers or not self._comparison_data:
            return
        self._is_minimal_profiling = 'E2E Time(Not minimal profiling)' not in self._headers
        if self._has_base_collection:
            self.cur_data["comparison_result"] = result_data
        time_category_dict = {}
        for time_category, time_list in self.performance_time_dict.items():
            time_value = self.get_time_value(time_category, self._comparison_data)
            if time_value == Constant.INVALID_VALUE:
                continue
            duration, _ = self.split_duration_and_num(time_value)
            time_category = time_category.split("(")[0]
            time_category_dict[time_category] = duration
            self.get_sub_category_time(time_category, time_list, duration)
        self.cur_data["overall_data"] = time_category_dict

    def get_time_value(self, header_name: str, data_list: list):
        """Return the cell of ``data_list`` under ``header_name``, or INVALID_VALUE."""
        try:
            data_index = self._headers.index(header_name)
        except ValueError:
            return Constant.INVALID_VALUE
        try:
            time_value = data_list[data_index]
        except IndexError:
            return Constant.INVALID_VALUE
        return time_value

    def get_sub_category_time(self, category: str, time_list: list, total_duration: float):
        """Collect per-subtype duration/ratio/kernel-count rows for ``category``."""
        sub_time_dict = {}
        for time_name in time_list:
            time_value = self.get_time_value(time_name, self._comparison_data)
            if time_value == Constant.INVALID_VALUE:
                continue
            sub_time_dict.setdefault(f"{category} Subtype", []).append(self.time_name_map.get(time_name, ""))
            duration, num = self.split_duration_and_num(time_value)
            sub_time_dict.setdefault("Duration(s)", []).append(duration)
            sub_time_dict.setdefault("Duration Ratio", []).append(
                "{:.2%}".format(self.calculate_ratio(duration, total_duration)))
            sub_time_dict.setdefault("Kernel Number", []).append(num)
        self.cur_data[self.time_name_map.get(category)] = sub_time_dict

    def identify_bottleneck(self):
        """Derive bottleneck descriptions (overall and vs. baseline)."""
        overall_data = self.cur_data.get("overall_data")
        if not overall_data:
            return
        # Bug fix: e2e_time used to be pre-formatted to a string
        # ('%.3f' % sum(...)), which made the calculate_ratio() call below
        # divide by a str and raise TypeError. Keep it numeric and format
        # only at display time.
        e2e_time = sum(overall_data.values())
        overall_bottleneck = f"The Model E2E Time is {e2e_time:.3f}s.\n"
        comparison_bottleneck = ""
        for time_type, time_value in overall_data.items():
            # add subtype time bottleneck
            advice = self.advice_map.get(time_type, "")
            self.cur_bottleneck[self.time_name_map.get(time_type)] = f"{time_type} is {time_value}s.\n{advice}"
            # add overall bottleneck
            overall_bottleneck += f" -- {time_type} is {time_value}s\n"
            if time_type == "Free Time" and self._is_minimal_profiling \
                    and self.calculate_ratio(time_value, e2e_time) > 0.1:
                overall_bottleneck += "percentage of free time exceed the threshold 10%."
            if not self._has_base_collection:
                continue
            # add comparison bottleneck
            time_type_origin = "Uncovered Communication Time(Wait Time)" \
                if time_type == "Uncovered Communication Time" else time_type
            base_duration, _ = self.split_duration_and_num(self.get_time_value(time_type_origin, self._base_data))
            if time_value > base_duration:
                ratio = "{:.2%}".format(self.calculate_ratio(time_value - base_duration, base_duration))
                comparison_bottleneck += f"{time_type} exceeds the benchmark by {ratio}\n"
        self.cur_bottleneck["overall_data"] = overall_bottleneck
        self.cur_bottleneck["comparison_result"] = comparison_bottleneck

    def output(self):
        """Publish collected data/bottlenecks/advice into the output structure."""
        self.output_format_data[self.DATA] = self.cur_data
        self.output_format_data[self.BOTTLENECK] = self.cur_bottleneck
        self.output_format_data[self.ADVICE] = self.cur_advices
class ClusterStepTraceTimeBean:
    """One row of "cluster step trace time.csv" with typed column accessors.

    Missing or malformed cells raise ``ValueError`` with the offending
    column named, so callers get a uniform error type.
    """

    STEP = "Step"
    TYPE = "Type"
    INDEX = "Index"
    COMPUTING = "Computing"
    COMMUNICATION = "Communication(Not Overlapped)"
    FREE = "Free"

    def __init__(self, data: dict):
        self._data = data

    @property
    def step(self) -> str:
        return self._data.get(self.STEP, '')

    @property
    def type(self) -> str:
        return self._data.get(self.TYPE, '')

    @property
    def index(self) -> int:
        # Bug fix: a missing "Index" column yields None, and int(None) raises
        # TypeError, not ValueError -- catch both so callers always see the
        # documented ValueError.
        try:
            return int(self._data.get(self.INDEX))
        except (ValueError, TypeError) as e:
            msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Index'."
            raise ValueError(msg) from e

    def _float_column(self, column: str) -> float:
        """Parse ``column`` as float; raise ValueError naming the column."""
        try:
            return float(self._data.get(column, ''))
        except (ValueError, TypeError) as e:
            msg = f"[ERROR] Cluster step trace time.csv has invalid value in column '{column}'."
            raise ValueError(msg) from e

    @property
    def compute(self) -> float:
        return self._float_column(self.COMPUTING)

    @property
    def communication(self) -> float:
        return self._float_column(self.COMMUNICATION)

    @property
    def free(self) -> float:
        return self._float_column(self.FREE)
class OpScheduleAdvice(TimelineAdviceBase):
    """Detects operator-scheduling bottlenecks from Computing/Free overlap data."""

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.cur_data = list()
        self.cur_bottleneck = str()
        self.cur_advice = str()

    def run(self):
        """Entry point: check path, pre-parse the timeline, analyze, output."""
        if not self.path_check():
            return self.output_format_data
        self.preparse()
        self.process()
        self.output()
        return self.output_format_data

    def process(self):
        """Pair each Computing interval with its preceding Free gap and rate them."""
        cpt_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_CPT]
        free_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_FREE]
        if not cpt_data or not free_data:
            print("[ERROR] Fail to find Overlap data.")
            return

        op_dur = [entry.get("dur", 0) for entry in cpt_data]
        op_free = [0.0] * len(cpt_data)
        merge_data = list()
        merge_data.extend(cpt_data)
        merge_data.extend(free_data)
        # Decimal keeps full timestamp precision when ordering events.
        merge_data.sort(key=lambda x: Decimal(x.get("ts")))
        idx = free_idx = 0
        while idx < len(merge_data) and free_idx < len(op_free):
            entry = merge_data[idx]
            entry_name = entry.get("name")
            if entry_name == 'Free':
                op_free[free_idx] = merge_data[idx].get('dur')
            elif entry_name == 'Computing':
                free_idx += 1
            idx += 1
        self.cur_data.append(op_dur)
        self.cur_data.append(op_free)
        free_ratio, cpt_ratio, _ = self.get_ratio()
        if free_ratio < 0.2:
            return
        # Bug fix: the message used to label the *free* ratio as "NPU
        # Utilication" (typo) and the computing ratio as free utilization.
        # NPU utilization is the computing share; free utilization is idle.
        self.cur_bottleneck = f"NPU Utilization: {round(cpt_ratio * 100, 2)}%, " \
                              f"NPU Free Utilization: {round(free_ratio * 100, 2)}%."
        if len(self.preparse_data[self.PREPARSE_TYPE.SYNCHRONIZE]) > 1:
            self.cur_advice = f"Device synchronize {len(self.preparse_data[self.PREPARSE_TYPE.SYNCHRONIZE])} times, " \
                              "try to reduce synchronization statements to alleviate the bottleneck of operator delivery.\n"
        small_op_num = self.small_op_block(op_free, op_dur)
        small_op_ratio = small_op_num / len(op_dur) if op_dur else 0.0
        if small_op_ratio > Constant.SMALL_OP_NUM_RATIO:
            self.cur_advice += "There are too many small operators, you can increase the batch size appropriately."

    def small_op_block(self, op_frees, op_durs):
        """Count ops whose preceding free gap dominates their own duration."""
        small_op_num = 0
        for op_free, op_dur in zip(op_frees, op_durs):
            if op_free > op_dur * Constant.SMALL_OP_DUR_RATIO:
                small_op_num += 1
        return small_op_num

    def get_ratio(self):
        """Return (free, computing, communication) shares of total device time."""
        cpt_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_CPT]
        free_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_FREE]
        cmu_data = self.preparse_data[self.PREPARSE_TYPE.OVERLAP_CMU]
        cpt_time = sum([x.get("dur", 0) for x in cpt_data])
        free_time = sum([x.get("dur", 0) for x in free_data])
        cmu_time = sum([x.get("dur", 0) for x in cmu_data])
        total_time = cpt_time + free_time + cmu_time
        if total_time > 0.0:
            return (free_time / total_time, cpt_time / total_time, cmu_time / total_time)
        return (0.0, 0.0, 0.0)
class OptimizerAdvice(TimelineAdviceBase):
    """Suggests NPU fused optimizers as drop-in replacements for stock ones."""

    # Stock optimizer step event name -> NPU fused equivalent.
    OPTIMIZER_MAP = {
        "Optimizer.step#SGD.step": "torch_npu.optim.NpuFusedSGD",
        "Optimizer.step#Adadelta.step": "torch_npu.optim.NpuFusedAdadelta",
        "Optimizer.step#Lamb.step": "torch_npu.optim.NpuFusedLamb",
        "Optimizer.step#Adam.step": "torch_npu.optim.NpuFusedAdam",
        "Optimizer.step#AdamW.step": "torch_npu.optim.NpuFusedAdamW",
        "Optimizer.step#AdamP.step": "torch_npu.optim.NpuFusedAdamP",
        "Optimizer.step#BertAdam.step": "torch_npu.optim.NpuFusedBertAdam",
        "Optimizer.step#RMSprop.step": "torch_npu.optim.NpuFusedRMSprop",
        "Optimizer.step#RMSpropTF.step": "torch_npu.optim.NpuFusedRMSpropTF",
    }

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.cur_data = list()
        self.cur_bottleneck = str()
        self.cur_advice = str()

    def run(self):
        """Entry point: check path, pre-parse the timeline, analyze, output."""
        if not self.path_check():
            return self.output_format_data
        self.preparse()
        self.process()
        self.output()
        return self.output_format_data

    def process(self):
        """Build replacement advice for every distinct optimizer seen."""
        optimizer_entries = self.preparse_data[self.PREPARSE_TYPE.OPTIMIZER]
        if not optimizer_entries:
            return

        # dict.fromkeys dedupes while keeping first-seen order, so the advice
        # text is deterministic (list(set(...)) ordering varied between runs).
        self.cur_data = list(dict.fromkeys(entry.get("name", None) for entry in optimizer_entries))
        self.cur_advice = "\n".join(
            f"You can choose {self.OPTIMIZER_MAP.get(opt_name)} to replace the current Optimizer: {opt_name}."
            for opt_name in self.cur_data
        )
        self.cur_bottleneck = self.cur_advice
class TimelineAdviceBase(AdviceBase):
    """Shared scaffolding for timeline-based advice: path validation plus a
    one-pass pre-parse of trace_view.json into categorized event buckets."""

    class PREPARSE_TYPE:
        # Bucket ids for pre-parsed timeline events.
        OPTIMIZER = 0
        STEP = 1
        OVERLAP_CPT = 2
        OVERLAP_FREE = 3
        OVERLAP_CMU = 4
        ENQUEUE = 5
        DEQUEUE = 6
        HOST_TO_DEVICE = 7
        SYNCHRONIZE = 8

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.trace_view_path = ""
        self.has_preparse = False
        self.preparse_data = defaultdict(list)
        # Exact event names that map straight to a bucket.
        self.entry_map = {
            'Computing': self.PREPARSE_TYPE.OVERLAP_CPT,
            'Free': self.PREPARSE_TYPE.OVERLAP_FREE,
            'AscendCL@aclrtSynchronizeDevice': self.PREPARSE_TYPE.SYNCHRONIZE
        }

    def path_check(self):
        """
        check whether input path is valid
        """
        if not os.path.exists(self.collection_path):
            print("[ERROR] Path: {} is not exist.".format(self.collection_path))
            return False
        is_ascend_pt_dir = os.path.isdir(self.collection_path) and self.collection_path.endswith("ascend_pt")
        is_trace_file = os.path.isfile(self.collection_path) \
            and os.path.basename(self.collection_path) == "trace_view.json"
        if is_ascend_pt_dir:
            output_dir = os.path.join(self.collection_path, "ASCEND_PROFILER_OUTPUT")
            self.trace_view_path = os.path.join(output_dir, "trace_view.json")
            if not os.path.exists(self.trace_view_path):
                print("[ERROR] trace_view.json is not exist in the Path: {}.".format(output_dir))
                return False
        elif is_trace_file:
            self.trace_view_path = self.collection_path
        else:
            print("[ERROR] Please input ascend_pt or trace_view.json.")
            return False
        print("[INFO] Start to analyse the target file: {}".format(self.trace_view_path))
        return True

    @abstractmethod
    def run(self):
        """
        analyze profiling data and advice
        """

    @abstractmethod
    def output(self):
        """
        output relevant data
        """
        self.output_format_data[self.DATA] = self.cur_data
        self.output_format_data[self.BOTTLENECK] = self.cur_bottleneck
        self.output_format_data[self.ADVICE] = self.cur_advice

    def preparse(self):
        """Bucket every named timeline event once; later calls are no-ops."""
        if self.has_preparse:
            return
        trace_events = FileManager.read_json_file(self.trace_view_path)
        if not isinstance(trace_events, list):
            return
        for event in trace_events:
            event_name = event.get("name", None)
            if not event_name:
                continue
            if event_name.startswith("Optimizer.step#") and event_name.endswith(".step"):
                self.preparse_data[self.PREPARSE_TYPE.OPTIMIZER].append(event)
            elif event_name.startswith("ProfilerStep#"):
                self.preparse_data[self.PREPARSE_TYPE.STEP].append(event)
            elif event_name in self.entry_map:
                self.preparse_data[self.entry_map[event_name]].append(event)
        self.has_preparse = True
class BaseAnalyzer(VersionControl, metaclass=ABCMeta):
    """Common scaffolding for analyzers: dataset wiring, result container,
    HTML renderer, and the ``check_data`` precondition decorator."""

    _SUPPORT_VERSIONS = constant.SUPPORTED_CANN_VERSION

    dataset_cls_list = []

    def __init__(self, collection_path, n_processes: int = 1, **kwargs):
        self.n_processes = n_processes
        self.cann_version = kwargs.get("cann_version", constant.DEFAULT_CANN_VERSION)
        self.torch_version = kwargs.get("torch_version", constant.DEFAULT_TORCH_VERSION)
        self.html_render = HTMLRender()
        self.collection_path = collection_path
        self.kwargs = kwargs
        self.dataset_list: Dict[str, List[Dataset]] = {}
        self.init_dataset_list()
        self.result = OptimizeResult()
        self.record_list: Dict[str, List] = {}

    @classmethod
    def check_data(cls, data_list: tuple):
        """
        check if all data in data list is contained
        :param data_list: data list to check
        :return: func ptr if check success
        """

        def decorate(func):

            @wraps(func)
            def wrapper(self, **kwargs):
                datasets = self.dataset_list
                if datasets is None:
                    return None
                # Every required key must be present, otherwise skip analysis.
                if any(required not in datasets for required in data_list):
                    return None

                logger.info("Enable analysis %s with %s", self.__class__.__name__, ",".join(data_list))
                return func(self)

            return wrapper

        return decorate

    @abstractmethod
    def optimize(self, **kwargs):
        pass

    @abstractmethod
    def make_record(self):
        pass

    @abstractmethod
    def make_render(self):
        pass

    def init_dataset_list(self) -> None:
        """Instantiate each declared dataset class and index it by key."""
        if not self.dataset_cls_list:
            logger.warning("Analyser: %s don't rely on any dataset!", self.__class__.__name__)
            return

        for dataset_cls in self.dataset_cls_list:
            if not (dataset_cls and callable(dataset_cls)):
                continue
            dataset = dataset_cls(collection_path=self.collection_path, data=self.dataset_list, **self.kwargs)
            self.dataset_list.setdefault(dataset_cls.get_key(), []).append(dataset)

    @staticmethod
    def get_first_data_by_key(data, key) -> Union[Dataset, None]:
        """
        get the first member from data with key
        :param data: input data
        :param key: data key
        :return: the first dataset in dataset list
        """
        datasets = data.get(key) if key in data else None
        return datasets[0] if datasets else None
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor_review/analyzer/cluster/slow_link_analyser.py b/profiler/advisor_review/analyzer/cluster/slow_link_analyser.py new file mode 100644 index 0000000000000000000000000000000000000000..846b79a50f31abb8445a0e5c2e82aaaf3c8ee23d --- /dev/null +++ b/profiler/advisor_review/analyzer/cluster/slow_link_analyser.py @@ -0,0 +1,122 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class SlowLinkAnalyzer(BaseAnalyzer):
    """Flags uneven RDMA/SDMA bandwidth across ranks in a cluster run."""

    RDMA_TIME_MS = "RDMA time(ms)"
    RDMA_SIZE_MB = "RDMA size(mb)"
    SDMA_TIME_MS = "SDMA time(ms)"
    SDMA_SIZE_MB = "SDMA size(mb)"
    RDMA_BANDWIDTH = "RDMA bandwidth(GB/s)"
    SDMA_BANDWIDTH = "SDMA bandwidth(GB/s)"
    COMMUNICATION_BANDWIDTH_INFO = "Communication Bandwidth Info"
    TRANSIT_TIME = "Transit Time(ms)"
    TRANSIT_SIZE = "Transit Size(MB)"
    SDMA = "SDMA"
    RDMA = "RDMA"
    SLOW_LINK_ANALYSIS = "slow_link_analysis"
    dataset_cls_list = [ClusterCommunicationDataSet]

    def __init__(self, collection_path, n_processes: int = 1, **kwargs):
        super().__init__(collection_path, n_processes, **kwargs)
        key = ClusterCommunicationDataSet.get_key()
        self.communication_data_class = self.get_first_data_by_key(self.dataset_list, key)
        self.rank_bw_dict = self.communication_data_class.get_data()
        self.result = OptimizeResult()
        self.bottelneck = ''
        self.suggestion = ''
        self.format_datas = []

    def optimize(self, **kwargs):
        """Analyze link bandwidth and return the populated OptimizeResult."""
        if self.rank_bw_dict is None:
            # Typo fix: the folder is "cluster_analysis_output".
            print("slow_link 分析失败,原因是数据加载失败,请检查你的cluster_analysis_output文件夹,如不关心这类数据请忽略")
            return self.result
        self.process()
        self.format_datas = self.format_details()
        self.make_record()
        self.make_render()
        return self.result

    def process(self):
        if self.rank_bw_dict:
            self.produce_bottleneck(self.RDMA_BANDWIDTH)
            self.produce_bottleneck(self.SDMA_BANDWIDTH)

    def produce_bottleneck(self, link_type: str):
        """Append a bottleneck note when average bandwidth for ``link_type`` is non-zero."""
        data_list = [rank_dict.get(link_type, 0) for rank_id, rank_dict in self.rank_bw_dict.items()]
        if not data_list:
            # Guard against an empty rank dict (would divide by zero below).
            return
        avg_bw = round(sum(data_list) / len(data_list), 3)
        if avg_bw == 0:
            return
        self.bottelneck += f'{link_type}: \n' \
                           f'    The average is {avg_bw}, \n' \
                           f'    while the maximum  is {round(max(data_list), 3)}GB/s \n' \
                           f'    and the minimum is {round(min(data_list), 3)}GB/s. \n' \
                           f'    the difference is {round(max(data_list) - min(data_list), 3)}GB/s. \n'

    def format_details(self):
        """Build a {headers, data} table with one row per rank, sorted by rank id."""
        if not self.rank_bw_dict:
            return {
                "headers": [],
                "data": []
            }

        details_dict = {}
        headers = list({k for rank_bw_value in self.rank_bw_dict.values() for k in rank_bw_value.keys()})
        headers.sort()
        data_list = [[rank_id] + [rank_bw.get(k, 0) for k in headers]
                     for rank_id, rank_bw in self.rank_bw_dict.items()]
        data_list.sort(key=lambda x: x[0])  # 按rank_id排序

        details_dict["headers"] = ["rank_id"] + headers
        details_dict["data"] = data_list

        return details_dict

    def make_record(self):
        """
        make record for what and how to optimize
        """
        optimization_item = OptimizeItem(
            SlowLinkAnalyzer.SLOW_LINK_ANALYSIS,
            self.bottelneck,
            self.suggestion
        )
        self.result.add(OptimizeRecord(optimization_item))

        for data in self.format_datas["data"]:
            self.result.add_detail(SlowLinkAnalyzer.SLOW_LINK_ANALYSIS, self.format_datas["headers"], data)

    def make_render(self):
        result_for_html = {
            "Description": self.bottelneck,
            "suggestion": self.suggestion,
            "details": [self.format_datas]
        }

        self.html_render.render_template(key="cluster",
                                         title=SlowLinkAnalyzer.SLOW_LINK_ANALYSIS,
                                         template_dir="templates",
                                         template_name="cluster_analysis.html",
                                         cann_version=self.cann_version,
                                         torch_version=self.torch_version,
                                         result=result_for_html)
class SlowRankAnalyzer(BaseAnalyzer):
    """Flags ranks whose computing/communication/free time diverges from the mean."""

    SLOW_RANK_ANALYSIS = "slow_rank_analysis"
    RANK = "rank"
    RATIO_THRESHOLD = 0.05
    BOTTLENECK_LIST = ['Computing', 'Communication', "Free"]
    dataset_cls_list = [ClusterStepTraceTimeDataSet]

    def __init__(self, collection_path, n_processes: int = 1, **kwargs):
        super().__init__(collection_path, n_processes, **kwargs)
        key = ClusterStepTraceTimeDataSet.get_key()
        self.step_trace_class = self.get_first_data_by_key(self.dataset_list, key)
        self.step_trace_dict = self.step_trace_class.get_data()
        self.result = OptimizeResult()
        self.bottelneck = ''
        self.suggestion = ''
        self.format_datas = []

    def optimize(self, **kwargs):
        """Analyze per-rank step times and return the populated OptimizeResult."""
        if self.step_trace_dict is None:
            # Typo fix: the folder is "cluster_analysis_output".
            print("slow_rank 分析失败,原因是数据加载失败,请检查你的cluster_analysis_output文件夹,如不关心这类数据请忽略")
            return self.result
        self.process()
        self.format_datas = self.format_details()
        self.make_record()
        self.make_render()
        return self.result

    def process(self):
        total_time_list = [sum(data_tuple) for rank_id, data_tuple in self.step_trace_dict.items()]
        if total_time_list:
            mean_total_time = sum(total_time_list) / len(total_time_list)
            # One pass per time category (Computing / Communication / Free).
            for produce_type, _ in enumerate(self.BOTTLENECK_LIST):
                self.produce_bottleneck(self.step_trace_dict, produce_type, mean_total_time)

    def produce_bottleneck(self, step_dict: dict, produce_type: int, mean_total_time: float):
        """Flag the category when the max-min gap exceeds RATIO_THRESHOLD of the mean."""
        data_list = [data_tuple[produce_type] for rank_id, data_tuple in step_dict.items()]
        max_ratio = self.compute_max_gap_ratio(data_list, mean_total_time)
        if max_ratio > self.RATIO_THRESHOLD:
            self.bottelneck += f'{self.BOTTLENECK_LIST[produce_type]} \n' \
                               f'    has some issues in the cluster, \n' \
                               f'    because the max difference of {self.BOTTLENECK_LIST[produce_type]} time \n' \
                               f'    has reached {round(max_ratio * mean_total_time / 1000, 3)}ms. \n'

    def make_record(self):
        """
        make record for what and how to optimize
        """
        optimization_item = OptimizeItem(
            SlowRankAnalyzer.SLOW_RANK_ANALYSIS,
            self.bottelneck,
            self.suggestion
        )
        self.result.add(OptimizeRecord(optimization_item))
        for data in self.format_datas["data"]:
            self.result.add_detail(SlowRankAnalyzer.SLOW_RANK_ANALYSIS, self.format_datas["headers"], data)

    def format_details(self):
        """Build a {headers, data} table with one row per rank."""
        details_dict = {}
        details_dict["headers"] = ["rank_id", "compute", "communication", "free"]
        details_dict["data"] = [[rank_id] + value for rank_id, value in self.step_trace_dict.items()]
        return details_dict

    def make_render(self):
        result_for_html = {
            "Description": self.bottelneck,
            "suggestion": self.suggestion,
            "details": [self.format_datas]
        }

        self.html_render.render_template(key="cluster",
                                         title=SlowRankAnalyzer.SLOW_RANK_ANALYSIS,
                                         template_dir="templates",
                                         template_name="cluster_analysis.html",
                                         cann_version=self.cann_version,
                                         torch_version=self.torch_version,
                                         result=result_for_html)

    @staticmethod
    def compute_max_gap_ratio(data: list, mean: float):
        """Return (max - min) / mean, or 0 when the mean is zero."""
        if mean == 0:
            return 0
        return (max(data) - min(data)) / mean
profiler.advisor.analyzer.computation.operator_checker import OperatorChecker, logger +from profiler.advisor.analyzer.schedule.fusion_ops.timeline_api_stack_checker import OpStackFinder +from profiler.advisor.common import constant +from profiler.advisor.dataset.dataset import Dataset +from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset + + +class AicpuChecker(OperatorChecker): + _CHECKER = "aicpu operator" + _PROBLEM = "AICPU operator" + _MIN_TASK_DURATION = 20 + _description = f"Some operators and task duration exceed {_MIN_TASK_DURATION} us, such as :\n" + _SUGGESTION: List[str] = ["Modify code to avoid aicpu operator"] + STACK_INFO_ITEMS = "stack_info" + SUGGESTION_INFO_ITEMS = "suggestions" + _ITEMS = [ + "op_name", "op_type", "task_duration", "input_shapes", "input_data_types", "input_formats", "output_shapes", + "output_data_types", "output_formats" + ] + + def __init__(self, cann_version): + super(AicpuChecker, self).__init__(cann_version=cann_version) + self.aicpu_rules: Dict = {} + self.aicpu_checker: Dict = {} + self.load_aicpu_rules() + + def _check_data(self, profiling_data: ProfilingDataset) -> bool: + if not self._check_summary(profiling_data): + return False + return True + + def _check_operator(self, op_info) -> bool: + return op_info.task_type == constant.AI_CPU + + def load_aicpu_rules(self, rule_path="rules/aicpu_rules.yaml") -> Dict: + if not os.path.isabs(rule_path): + rule_path = os.path.join(os.path.dirname(__file__), + "../../../", rule_path) + + if not os.path.exists(rule_path): + logger.warning("Skip analyze aicpu issues, because %s does not exist.", rule_path) + return {} + with open(rule_path, 'r') as f: + self.aicpu_rules = yaml.safe_load(f) + self.filter_aicpu_rules(self.aicpu_rules) + for checker_name, check_rule in self.aicpu_rules.items(): + if not isinstance(check_rule, (list, dict,)): + continue + + if checker_name not 
in AICPU_CHECKER.keys(): + logger.warning("Skip %s, which is not support now.", checker_name) + continue + + self.aicpu_checker[checker_name] = AICPU_CHECKER[checker_name](check_rule) + + def filter_aicpu_rules(self, aicpu_rules): + support_checkers = [] + for checkers in aicpu_rules['CommonChecker']: + for key, value in checkers.items(): + if key == 'DataTypeChecker' and self.cann_version in value['cann_version']: + support_checkers.append(checkers) + aicpu_rules['CommonChecker'] = support_checkers + return + + def check_aicpu_attr(self, op_info) -> List[str]: + suggestions = [] + for _, checker in self.aicpu_checker.items(): + suggestions.extend(checker.check(op_info)) + return suggestions + + def check(self, profiling_data: ProfilingDataset) -> bool: + """ + check if any operator need optimize + :param profiling_data: profiling datasest + :return: true or false + """ + + if not self._check_data(profiling_data): + return False + op_summary = profiling_data.op_summary + + def get_opeartor_stack_info(api_stack_finder: OpStackFinder, op_name_list: list) -> list: + data: Dict[str, Dataset] = {} + event_dataset = TimelineEventDataset(collection_path=profiling_data.collection_path, data=data, task_type=constant.AI_CPU) + + # disable multiprocessing, avoid cost time of enable new process for light task + api_stack_finder.get_api_stack_by_op(event_dataset, op_name_list, constant.AI_CPU, + disable_multiprocess=True) + return api_stack_finder._stack_record + + self._op_list = [] + total_task_duration = 0.0 + max_task_duration = 0.0 + for op_info in op_summary.op_list: + if self._check_operator(op_info): + self._op_list.append(op_info) + + task_duration = float(op_info.task_duration) + total_task_duration += task_duration + max_task_duration = max(max_task_duration, task_duration) + if (not self._op_list) or (max_task_duration < self._MIN_TASK_DURATION): + return False + + # 获取所有算子堆栈的信息 + op_name_list = [] + for op in self._op_list: + if op.op_name not in op_name_list: + 
op_name_list.append(op.op_name) + api_stack_finder = OpStackFinder() + stack_record = get_opeartor_stack_info(api_stack_finder, op_name_list) + + # task_id 到 stack 信息的对应 + self._op_list.sort(key=lambda x: int(x.task_id)) + stack_record.sort(key=lambda x: x[0]) + task_id_to_stack = dict() + for stack in stack_record: + task_id_to_stack[stack[0]] = stack[-1] + + # 算子追加堆栈属性 + for op in self._op_list: + stack = task_id_to_stack.get(int(op.task_id)) + op.add_attr(self.STACK_INFO_ITEMS, stack) + suggestions = self.check_aicpu_attr(op) + op.add_attr(self.SUGGESTION_INFO_ITEMS, suggestions) + + # double 类型算子判断 + double_type_ai_cpu_operator = [] + for op in self._op_list: + if not op.has_attr("input_data_types"): + logger.warning( + "Skip checking of input data in AICPU checker because of not containing input_data_dtypes in op summary") + break + if op.has_attr( + "input_data_types") and "DOUBLE" in op.input_data_types and op.op_name not in double_type_ai_cpu_operator: + double_type_ai_cpu_operator.append(op.op_name) + if bool(double_type_ai_cpu_operator): + self._SUGGESTION.append("Try to convert double type operator to float, such as {}".format( + ",".join(double_type_ai_cpu_operator))) + return True + + def make_render(self, html_render, record): + html_render.render_template(key="computation", + template_dir="templates", + template_name="operator_ai_cpu.html", + format_result=self.format_operator_result(record, constant.OPERATOR_LIST_UNLIMIT)) + + def format_operator_result(self, record, limit): + """ + Format operator result to html + :param record: profiling check record + :param limit: Limit number of operator statistics lists. + :return: + """ + optimization_item = record.optimization_item + release_suggestion_list = [] + for suggestion in optimization_item.suggestion: + release_suggestion_list.append(suggestion.replace('\n', '
')) + logger.debug("suggestion list is %s", release_suggestion_list) + format_result = {"record": record.__dict__, "suggestion": '
class BaserChecker:
    """Base class for rule-driven AICPU attribute checkers.

    Subclasses populate self.checker_list in build(); check() then applies
    every registered callable to one operator record.
    """

    def __init__(self, *args, **kwargs):
        self.checker_list = []

    def build(self):
        raise NotImplementedError

    def check(self, op_info) -> List[str]:
        """Run every registered checker on op_info and collect the non-None suggestions."""
        raw_results = (checker(op_info) for checker in self.checker_list)
        return [suggestion for suggestion in raw_results if suggestion is not None]


class CommonChecker(BaserChecker):
    """Checker driven by the 'CommonChecker' rules of aicpu_rules.yaml."""

    def __init__(self, check_rules: List[Dict] = None):
        super().__init__()
        self.check_rules = [] if check_rules is None else check_rules
        self.supported_checker = dict(DataTypeChecker=self.datatype_checker)
        self.build()

    @staticmethod
    def datatype_checker(check_item: Dict, op_info) -> Optional[str]:
        """Return a suggestion string when op_info uses an input/output dtype
        outside the rule's valid lists; None when the operator is fine,
        ignored, or not covered by the rule."""
        supported_op_type = check_item.get('op_type', [])
        suggestion = check_item.get('suggestion', "")
        valid_inputs = check_item.get('input', [])
        valid_outputs = check_item.get('output', [])
        ignore_type = check_item.get('ignore_type', [])

        op_type = getattr(op_info, 'op_type', "UNKNOWN")
        rule_applies = "__ALL__" in supported_op_type or op_type.lower() in supported_op_type
        if not rule_applies or op_type.lower() in ignore_type:
            return None

        # dtype columns are ';'-separated in the op summary
        input_dtypes = [item.lower() for item in getattr(op_info, 'input_data_types', "").split(";")]
        output_dtypes = [item.lower() for item in getattr(op_info, 'output_data_types', "").split(";")]
        unsupported = set(input_dtypes).difference(valid_inputs) | set(output_dtypes).difference(valid_outputs)
        if not unsupported:
            return None

        return suggestion.format(",".join(unsupported).upper(),
                                 op_type,
                                 ",".join(valid_inputs).upper())

    def build(self):
        """Bind each rule to its implementation; unknown rule names are skipped with a warning."""
        for check in self.check_rules:
            (check_func, check_rule), = check.items()
            implementation = self.supported_checker.get(check_func)
            if implementation is None:
                logger.warning("Skip %s, which has not been implemented.", check_func)
                continue
            self.checker_list.append(partial(implementation, check_rule))


class ExampleGuideChecker(BaserChecker):
    """Checker that points matching operator types at a tuning-guide URL."""

    def __init__(self, check_rules: List[Dict] = None):
        super().__init__()
        self.check_rules = [] if check_rules is None else check_rules
        self.build()

    def build(self):
        def _guide_url(check_item: Dict, op_info) -> Optional[str]:
            supported_op_type = check_item.get('op_type', [])
            url = check_item.get('url', "")
            suggestion = check_item.get('suggestion', "")
            if getattr(op_info, 'op_type', "UNKNOWN").lower() in supported_op_type:
                # '{}' in the suggestion is a placeholder for the guide URL
                return suggestion if "{}" not in suggestion else suggestion.format(url)

        for check in self.check_rules:
            (_, check_rule), = check.items()
            self.checker_list.append(partial(_guide_url, check_rule))


AICPU_CHECKER = {
    "CommonChecker": CommonChecker,
    "ExampleGuideChecker": ExampleGuideChecker
}
class BlockDimChecker(OperatorChecker):
    """Checks whether AI Core / AI Vector Core operators spread their work
    across all available cores (block_dim divisible by the core count)."""

    _SUGGESTION: List[str] = []
    _CHECKER = "block dim"
    _PROBLEM = "block dim"
    _description = "some operator does not make full use of {} ai core"
    # Defaults prevent AttributeError: _check_data only assigned these when the
    # corresponding info.json keys existed, yet _check_data/_check_operator/
    # get_core_num read them unconditionally (e.g. when "aiv_num" is missing).
    _aicore_num = 0
    _aiv_num = 0
    _ITEMS = [
        "op_name", "op_type", "task_type", "task_duration", "income", "block_dim", "mix_block_dim", "input_shapes",
        "input_data_types", "input_formats", "output_shapes", "output_data_types", "output_formats"
    ]

    def pre_check(self, profiling_data) -> bool:
        # Block-dim analysis is meaningless for dynamic-shape workloads.
        return not self.is_dynamic_shape(profiling_data)

    def _check_data(self, data):
        """Validate that the op summary and core-count config are usable and
        finalize the checker description. Returns False (with a warning) when
        a prerequisite is missing."""
        self.format_suggestion_content(data)
        if not self._check_summary(data):
            return False
        if not Config().get_config("ai_core_num"):
            logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "ai core num in info.json file")
            return False
        summary = data.op_summary
        if not summary.op_list:
            # guard: op_list[0] below would raise IndexError on an empty summary
            logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "operators in op summary")
            return False
        op_info = summary.op_list[0]
        if not hasattr(op_info, "block_dim"):
            logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "block dim in op summary")
            return False
        if Config().get_config("ai_core_num"):
            self._aicore_num = int(Config().get_config("ai_core_num"))
        if Config().get_config("aiv_num"):
            self._aiv_num = int(Config().get_config("aiv_num"))
        self._description = self._description.format(self._aicore_num)
        if self._aiv_num:
            self._description += f" or {self._aiv_num} ai vector core"
        self._description += f";\n Top-{OperatorChecker._MAX_TUNE_OP_NUM} operator of " \
                             "task duration are as follows:\n"
        return True

    def make_render(self, html_render, record):
        """Render this checker's findings into the block-dim HTML template."""
        html_render.render_template(key="computation",
                                    template_dir="templates",
                                    template_name="operator_block_dim.html",
                                    format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK))

    def _check_operator(self, op_info) -> bool:
        """Return True when the operator under-utilizes its cores (block_dim
        not a multiple of the relevant core count)."""
        if op_info.task_type not in ["AI_CORE", "AI_VECTOR_CORE", "MIX_AIC"]:
            return False
        block_dim = int(op_info.block_dim)
        core_num = self.get_core_num(op_info)
        if block_dim % core_num == 0:
            return False
        # MIX_AIC ops also pass if their vector-core split fills the AIV cores
        if op_info.task_type == "MIX_AIC" and hasattr(op_info, "mix_block_dim") \
                and self._aiv_num and int(op_info.mix_block_dim) % self._aiv_num == 0:
            return False
        return True

    def get_core_num(self, op_info):
        """
        get core num of task type
        """
        if op_info.task_type == "AI_CORE" or not self._aiv_num:
            core_num = self._aicore_num
        else:
            core_num = self._aiv_num
        return core_num
class OperatorBoundChecker(OperatorChecker):
    """Flags long-running operators whose pipeline ratios (vector, cube,
    scalar, mte) all stay below the configured bound threshold."""

    _MIN_TASK_DURATION = 20  # min task duration 20us
    _CHECKER = "operator no bound"
    _PROBLEM = "operator no bound"
    _SUGGESTION: List[str] = []
    _description = (
        f"There is no mte, cube, vector, scalar ratio is more than {to_percent(Config().operator_bound_ratio)};\n" +
        f"Top task duration operators need to be tuned are as follows: \n")
    _ITEMS = [
        "op_name", "op_type", "task_type", "task_duration", "vec_ratio", "mac_ratio", "scalar_ratio", "mte1_ratio",
        "mte2_ratio", "mte3_ratio", "block_dim", "input_shapes", "input_data_types", "input_formats", "output_shapes",
        "output_data_types", "output_formats"
    ]

    def pre_check(self, profiling_data) -> bool:
        # Ratio analysis does not apply to dynamic-shape workloads.
        return not self.is_dynamic_shape(profiling_data)

    def _check_data(self, data):
        """Return whether the op summary carries usable ratio data."""
        self.format_suggestion_content(data)
        if not self._check_summary(data):
            return False
        op_list = data.op_summary.op_list
        if op_list:
            # NOTE(review): only the first operator is probed, mirroring the
            # original loop that returned on its first iteration — confirm
            # whether all operators were meant to be scanned.
            return self._check_operator(op_list[0])
        logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "ratio in op summary")
        return False

    def _check_operator(self, op_info) -> bool:
        """True when ratio data exists and no single ratio exceeds the bound."""
        ratio_attrs = ["vec_ratio", "mac_ratio", "scalar_ratio", "mte1_ratio", "mte2_ratio", "mte3_ratio"]
        ratios = [self.get_ratio(op_info, attr) for attr in ratio_attrs]
        if not any(ratios):
            return False  # no data, skip check
        bound_exceeded = any(ratio and ratio > Config().operator_bound_ratio for ratio in ratios)
        return not bound_exceeded

    def make_render(self, html_render, record):
        """Render this checker's findings into the no-bound HTML template."""
        html_render.render_template(key="computation",
                                    template_dir="templates",
                                    template_name="operator_no_bound.html",
                                    format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK))
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor_review/analyzer/computation/op_compile/dynamic_shape_checker.py b/profiler/advisor_review/analyzer/computation/op_compile/dynamic_shape_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..86d3bac4ff8cb163d23a6365307b855839b12a6a --- /dev/null +++ b/profiler/advisor_review/analyzer/computation/op_compile/dynamic_shape_checker.py @@ -0,0 +1,65 @@ +import copy +import logging +from typing import List + +from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker +from profiler.advisor.common import constant +from profiler.advisor.dataset.profiling.info_collection import OpInfo +from profiler.advisor.result.item import OptimizeItem, StatisticsItem, OptimizeRecord + +logger = logging.getLogger() + + +class DynamicShapeChecker(OperatorChecker): + ENABLE_COMPILED_SUGGESTION = "Optimize by enabling compiled operator, such as:\n" \ + "`torch_npu.npu.set_compile_mode(jit_compile=False)`\n" + _SUGGESTION: List[str] = [ENABLE_COMPILED_SUGGESTION] + _CHECKER = "dynamic shape operator" + _PROBLEM = "Dynamic shape operator" + _description = f"Found all operators are dynamic shape" + _op_list: List[OpInfo] = [] + _tune_op_list: List[str] = [] # record op name to be tuned, and save to tune_ops_file.cfg + _op_views: List = [] + + def __init__(self, cann_version) -> None: + super().__init__(cann_version=cann_version) + + def check(self, profiling_database) -> bool: + return self.is_dynamic_shape(profiling_database) + + def make_record(self, profiling_database) -> OptimizeRecord: + """ + make record for what and how to optimize + """ + + optimization_item = OptimizeItem( + self._PROBLEM, + self._description, + self._SUGGESTION + ) + statistics_item = StatisticsItem("", "", 1) + return OptimizeRecord(optimization_item, statistics_item) + + def format_operator_result(self, record, limit=-1): + """ + Format operator result to 
html + :param record: profiling check record + :param limit: Limit number of operator statistics lists. + :return: + """ + optimization_item = record.optimization_item + release_suggestion_list = [] + for suggestion in optimization_item.suggestion: + release_suggestion = copy.deepcopy(suggestion) + if release_suggestion == DynamicShapeChecker.ENABLE_COMPILED_SUGGESTION: + release_suggestion += \ + f"for details please refer to link : LINK" + release_suggestion_list.append(release_suggestion.replace('\n', '
')) + format_result = {"record": record.__dict__, "suggestion": '
'.join(release_suggestion_list)} + return format_result + + def make_render(self, html_render, record): + html_render.render_template(key="computation", + template_dir="templates", + template_name="operator_dynamic_shape.html", + format_result=self.format_operator_result(record)) diff --git a/profiler/advisor_review/analyzer/computation/operator_checker.py b/profiler/advisor_review/analyzer/computation/operator_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..0f47650943a7355b494bd766214d10526c46c0fa --- /dev/null +++ b/profiler/advisor_review/analyzer/computation/operator_checker.py @@ -0,0 +1,307 @@ +import copy +import logging +from textwrap import fill +from typing import List + +from profiler.advisor.common import constant +from profiler.advisor.common.version_control import VersionControl +from profiler.advisor.config.config import Config +from profiler.advisor.dataset.profiling.info_collection import OpInfo +from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset +from profiler.advisor.result.item import OptimizeItem, StatisticsItem, OptimizeRecord +from profiler.advisor.utils.utils import safe_division + +logger = logging.getLogger() + + +class OperatorChecker(VersionControl): + _SUPPORT_VERSIONS = constant.SUPPORTED_CANN_VERSION + _MAX_TUNE_OP_NUM = constant.OPERATOR_OUT_TOPK + _MIN_TASK_DURATION = 0 + _MIN_TASK_DURATION_RATIO = 1.0 + _MIN_TOTAL_DURATION_RATIO = 1.0 + _CHECKER = str() + _PROBLEM = str() + _description = str() + STACK_INFO_ITEMS = "" + _ITEMS: List[str] = [] + _SUGGESTION: List[str] = [] + SKIP_CHECK_MSG = "Skip %s checker because of not containing %s" + _tune_op_info_list: List[OpInfo] = [] + PyTorch_OPERATOR_TUNE_SUGGESTION = f"Optimize operator by AOE, such as:\n" \ + f"'aoe --job_type=2 --model_path=$user_dump_path " \ + f"--tune_ops_file={Config().tune_ops_file}'\n" + MSLite_OPERATOR_TUNE_SUGGESTION = f"Optimize operator by AOE in mindspore lite framework, such as:\n" \ + 
f"converter_lite --fmk=ONNX --optimize=ascend_oriented --saveType=MINDIR " \ + f"--modelFile=$user_model.onnx --outputFile=user_model --configFile=./config.txt\n" + _tune_op_list: List[str] = [] + + def __init__(self, cann_version: str): + self.cann_version = cann_version + self._op_list: List[OpInfo] = [] + + def check(self, profiling_data: ProfilingDataset) -> bool: + """ + check if any operator need optimize + :param profiling_data: profiling datasest + :return: true or false + """ + if not self._check_data(profiling_data): + return False + + summary = profiling_data.op_summary + total_task_duration = 0.0 + max_task_duration = 0.0 + for op_info in summary.op_list: + if not self._check_operator(op_info): + continue + task_duration = float(op_info.task_duration) + total_task_duration += task_duration + max_task_duration = max(max_task_duration, task_duration) + self._op_list.append(op_info) + if task_duration > self._MIN_TASK_DURATION: + self._tune_op_info_list.append(op_info) + + if any([ + max_task_duration > self._MIN_TASK_DURATION, + round(safe_division(max_task_duration, summary.get_total_task_duration()), + 4) > self._MIN_TASK_DURATION_RATIO, + round(safe_division(total_task_duration, summary.get_total_task_duration()), 4) > + self._MIN_TOTAL_DURATION_RATIO, + ]): + self._op_list.sort(key=lambda x: float(x.get_attr("task_duration")), reverse=True) + self._tune_op_info_list.sort(key=lambda x: float(x.get_attr("task_duration")), reverse=True) + for op in self._op_list: + if op.op_name not in self._tune_op_list and len(self._tune_op_list) < constant.OPERATOR_OUT_TOPK: + self._tune_op_list.append(op.op_name) + return True + return False + + def make_record(self, profiling_data: ProfilingDataset): + """ + Make record for what and how to optimize + :param profiling_data: profiling data + :return: optimize record + """ + task_duration_list = [float(op_info.get_attr("task_duration")) for op_info in self._op_list if + hasattr(op_info, "get_attr")] + total_cost_time = 
sum(task_duration_list) + total_task_duration = profiling_data.op_summary.get_total_task_duration() + count = len(task_duration_list) + statistics_item = StatisticsItem(total_task_duration, total_cost_time, count, self.get_incomes()) + optimization_item = OptimizeItem( + self._PROBLEM, + self._get_description(self._description, self.get_op_type_list(self._op_list)[:self._MAX_TUNE_OP_NUM]), + self._SUGGESTION + ) + return OptimizeRecord(optimization_item, statistics_item) + + def _get_description(self, description, op_type_list=None): + if not op_type_list: + return description + + desc_suffix = [] + for i in range(len(op_type_list)): + if i % 3 == 0 and i != 0: + desc_suffix.append("\n") + + desc_suffix.append(f"{op_type_list[i]}") + + if i < len(op_type_list) - 1: + desc_suffix.append(", ") + + description += "".join(desc_suffix) + return description + + def pre_check(self, profiling_data) -> bool: + return True + + def is_dynamic_shape(self, profiling_database: ProfilingDataset) -> bool: + less_than_cann800_list = [constant.CANN_VERSION_C30, constant.CANN_VERSION_C13, constant.CANN_VERSION_C15] + # CANN 8.0.0 之前从 ge_info 中获取 op_state 属性,进行动态 shape 逻辑判断 + if self.cann_version in less_than_cann800_list: + if hasattr(profiling_database, "ge_info"): + ge_info = profiling_database.ge_info + static_shape_operators = ge_info.get_static_shape_operators() + if len(static_shape_operators) == 0: + return True + else: + logger.warning( + "Skip dynamic shape check because of not containing ge_info.db file in host filefloder.\n" + "To enable dynamic shape check, please try to set data_simplification=False in experimental_config.\n" + "More details please refer to link : %s", constant.ASCEND_PROFILER_URL) + else: + # CANN 8.0.0 之后 op_state 属性从 op_summary 文件中获取 + if hasattr(profiling_database, "op_summary"): + static_shape_operators = profiling_database.op_summary.get_static_shape_operators() + if len(static_shape_operators) == 0: + return True + else: + logger.warning( + "Skip 
dynamic shape check because of not containing op_summary.csv file in current filefloder." + ) + return False + + def format_operator_result(self, record, limit): + """ + Format operator result to html + :param record: profiling check record + :param limit: Limit number of operator statistics lists. + :return: + """ + optimization_item = record.optimization_item + release_suggestion_list = [] + for suggestion in optimization_item.suggestion: + release_suggestion = copy.deepcopy(suggestion) + if release_suggestion == OperatorChecker.PyTorch_OPERATOR_TUNE_SUGGESTION: + release_suggestion += \ + (f"for details please refer to link : LINK") + elif release_suggestion == OperatorChecker.MSLite_OPERATOR_TUNE_SUGGESTION: + release_suggestion += \ + (f"\nThe config file for MSLite AOE usage is as follows:\n" \ + f"[ascend_context]\n" \ + f"aoe_mode=\"operator tuning\"\n" \ + f"--tune_ops_file={Config().tune_ops_file}\n" + f"\nFor details please refer to link : LINK") + release_suggestion_list.append(release_suggestion.replace('\n', '
')) + format_result = {"record": record.__dict__, + "suggestion": fill('
'.join(release_suggestion_list), width=200), + "task_duration": round(record.statistics_item.task_duration, 2)} + statistic = self.group_by(copy.deepcopy(self._op_list), limit=limit) + format_result["statistic"] = statistic + return format_result + + def group_by(self, op_list, op_key="op_type", + limit: int = constant.OPERATOR_LIST_UNLIMIT): + """ + group by Profiling.OpInfo's attribute key, then return top limit tuple by duration + :param op_list: input a OpInfo list + :param op_key: group by Profiling.OpInfo's attribute key + :param limit: top limit num, if you do not need to limit the length of tuple, input -1(int) + :return: + """ + if op_list is None: + op_list = [] + statistic = {} # str, json + for op_info in op_list: + if statistic.get(op_info.get_attr(op_key)): + statistic[op_info.get_attr(op_key)]["summary"]["total_duration"] = float( + statistic[op_info.get_attr(op_key)]["summary"]["total_duration"]) + float( + op_info.get_attr("task_duration", constant.DEFAULT_DURATION_ZERO)) + statistic[op_info.get_attr(op_key)]["summary"]["counts"] += 1 + stack_info = op_info.get_attr("stack_info") + if stack_info: + op_info.stack_info = stack_info.replace('\r\n', '
') + statistic[op_info.get_attr(op_key)]["op_info_list"].append(op_info) + else: + statistic[op_info.get_attr(op_key)] = {"summary": {}, "op_info_list": []} + statistic[op_info.get_attr(op_key)]["summary"]["op_type"] = op_info.get_attr( + "op_type", constant.DEFAULT_OPERATOR_TYPE) + statistic[op_info.get_attr(op_key)]["summary"]["total_duration"] = float( + op_info.get_attr("task_duration", constant.DEFAULT_DURATION_ZERO)) + statistic[op_info.get_attr(op_key)]["summary"]["counts"] = 1 + stack_info = op_info.get_attr("stack_info") + if stack_info: + op_info.stack_info = stack_info.replace('\r\n', '
') + statistic[op_info.get_attr(op_key)]["op_info_list"] = [op_info] + + if statistic: + for op_key in statistic.keys(): + statistic[op_key]["summary"]["total_duration"] = round( + statistic[op_key]["summary"]["total_duration"], 2) + # Grouped by op_type, sorted by total_duration, and obtained the top 10 operators that take the most time. + if limit > 0: + statistic = sorted( + statistic.items(), key=lambda kv: kv[1]["summary"]["total_duration"], reverse=True)[:limit] + else: + statistic = sorted(statistic.items(), key=lambda kv: kv[1]["summary"]["total_duration"], reverse=True) + else: + logger.warning("%s checker do not has results to format html", str(self.__class__.__name__)) + return statistic + + def _check_data(self, profiling_data): + return True + + def _check_operator(self, op_info): + return False + + def _get_income(self, _op_info: OpInfo) -> float: + return 0 + + def get_tune_op_list(self): + """ + get tune op list + :return: tune op list + """ + return self._tune_op_list + + def get_views(self, _graph_data): + """Get node views.""" + return [] + + @classmethod + def get_name(cls): + """ + get name of checker + :return: checker name + """ + return cls._PROBLEM + + def get_incomes(self) -> float: + """get incomes""" + incomes = 0.0 + for op_info in self._op_list: + income = self._get_income(op_info) + setattr(op_info, "income", round(income, 2)) + incomes += income + return incomes + + def get_op_type_list(self, op_list: List[OpInfo]): + """get op type list""" + op_type_list = [] + for op_info in op_list: + if op_info.op_type not in op_type_list: + op_type_list.append(op_info.op_type) + return op_type_list + + def _check_summary(self, data: ProfilingDataset): + if not hasattr(data, "op_summary"): + logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "op summary") + return False + return True + + @staticmethod + def get_ratio(op_info: OpInfo, attr: str) -> float: + if not op_info.has_attr(attr): + return 0 + value = op_info.get_attr(attr) + if not value 
or value == "N/A": + return 0 + return float(value) + + def get_details(self) -> list: + """ + get details of operator to be optimized + :return: detail list + """ + op_list = self._op_list + if not op_list or not (self._ITEMS + [self.STACK_INFO_ITEMS]): + return [] + details = [] + attrs = [attr for attr in (self._ITEMS + [self.STACK_INFO_ITEMS]) if op_list[0].has_attr(attr)] + details.append(attrs) + op_list = sorted(op_list, key=lambda x: float(x.get_attr("task_duration")), reverse=True) + for op_info in op_list: + content = [ + op_info.get_attr(attr) if attr != "aicore_time" + else op_info.get_float_attr(attr, strict_mode=True) + + op_info.get_float_attr("aiv_time", strict_mode=True) for attr in attrs + ] + details.append(content) + return details + + def format_suggestion_content(self, profiling_data: ProfilingDataset) -> None: + if profiling_data.PROF_TYPE == constant.ASCEND_PYTORCH_PROFILER: + self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION) + elif profiling_data.PROF_TYPE == constant.MSLITE: + self._SUGGESTION.append(self.MSLite_OPERATOR_TUNE_SUGGESTION) diff --git a/profiler/advisor_review/analyzer/computation/profiling_analyzer.py b/profiler/advisor_review/analyzer/computation/profiling_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..8682617700702055628a31982b0eafab9feb336d --- /dev/null +++ b/profiler/advisor_review/analyzer/computation/profiling_analyzer.py @@ -0,0 +1,89 @@ +import logging +from abc import ABC +from typing import Dict, List + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.common import constant +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.analyzer.computation.aicpu.aicpu_checker import AicpuChecker +from profiler.advisor.analyzer.computation.bound.block_dim_checker import BlockDimChecker +from profiler.advisor.analyzer.computation.bound.operator_bound_checker import OperatorBoundChecker +from 
profiler.advisor.analyzer.computation.operator_checker import OperatorChecker +from profiler.advisor.analyzer.computation.op_compile.dynamic_shape_checker import DynamicShapeChecker +from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset +from profiler.advisor.utils.utils import get_supported_subclass + +logger = logging.getLogger() + + +class ProfilingAnalyzer(BaseAnalyzer, ABC): + dataset_cls_list = [ProfilingDataset] + + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = OperatorChecker(self.cann_version) + self.html_render = HTMLRender() + self.result = OptimizeResult() + + @BaseAnalyzer.check_data((ProfilingDataset.get_key(),)) + def optimize(self, **kwargs) -> OptimizeResult: + """ + optimize operator + :param data: input datasets + :return: result + """ + profiling_data = self.get_first_data_by_key(self.dataset_list, ProfilingDataset.get_key()) + checker = self.checker + if not checker.pre_check(profiling_data): + return self.result + if checker.check(profiling_data): + # add record + record = checker.make_record(profiling_data) + checker.make_render(self.html_render, record) + self.result.add(record) + # add details + details = checker.get_details() + if details: + for i, detail in enumerate(details): + if i == 0: + # the first row is header + self.result.add_detail(checker.get_name(), headers=detail) + else: + self.result.add_detail(checker.get_name(), detail=detail) + # add tune op list + tune_op_list = checker.get_tune_op_list() + if tune_op_list: + self.result.add_tune_op_list(tune_op_list) + + return self.result + + def make_record(self): + pass + + def make_render(self): + pass + + +class DynamicShapeAnalyzer(ProfilingAnalyzer): + def __init__(self, collection_path, **kwargs) -> None: + 
class BlockDimAnalyzer(ProfilingAnalyzer):
    """ProfilingAnalyzer specialized with the block-dim bound checker."""

    def __init__(self, collection_path, **kwargs) -> None:
        super().__init__(collection_path, **kwargs)
        self.checker = BlockDimChecker(self.cann_version)


class OperatorBoundAnalyzer(ProfilingAnalyzer):
    """ProfilingAnalyzer specialized with the operator-bound checker."""

    def __init__(self, collection_path, **kwargs) -> None:
        super().__init__(collection_path, **kwargs)
        self.checker = OperatorBoundChecker(self.cann_version)


class AicpuAnalyzer(ProfilingAnalyzer):
    """ProfilingAnalyzer specialized with the AI-CPU operator checker."""

    def __init__(self, collection_path, **kwargs) -> None:
        super().__init__(collection_path, **kwargs)
        self.checker = AicpuChecker(self.cann_version)
from typing import List
from functools import partial

from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
from profiler.advisor.result.result import OptimizeResult
from profiler.advisor.dataset.graph_dataset import GraphDataset
from profiler.advisor.analyzer.graph_fusion.graph_fusion_checker import GraphFusionRules
from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset
from profiler.advisor.display.html.render import HTMLRender


class FusionOPAnalyzer(BaseAnalyzer):
    """Analyzer that searches computation graphs for fusable operator patterns."""
    RULES = dict(graph_dataset=partial(GraphFusionRules, "rules/op_fusion_pass.yaml"))
    dataset_cls_list = [GraphDataset, ProfilingDataset]

    def __init__(self, collection_path, **kwargs) -> None:
        super(FusionOPAnalyzer, self).__init__(collection_path, **kwargs)
        self.result = OptimizeResult()
        self.html_render = HTMLRender()

    @BaseAnalyzer.check_data((GraphDataset.get_key(),))
    def optimize(self, **kwargs):
        """Run every fusion rule against the graph (and optional profiling) data.

        :return: the accumulated OptimizeResult
        """
        graph_data = self.dataset_list.get("GraphDataset")
        profiling_data = self.dataset_list.get("ProfilingDataset")
        self._check(graph_data, profiling_data)
        return self.result

    def _check(self, graph_data: List["GraphDataset"],
               profiling_data: List["ProfilingDataset"] = None) -> None:
        # Nothing to analyse without at least one non-empty graph.
        if len(graph_data) == 0 or graph_data[0].is_empty():
            return
        for rule_factory in self.RULES.values():
            checker = rule_factory()
            # With profiling data available, the checker also attaches task times.
            if profiling_data is None:
                checker.find_fusion_matched_issues(graph_data)
            else:
                checker.find_fusion_matched_issues_with_times(graph_data, profiling_data)
            checker.make_record(self.result)
            checker.make_render(self.html_render)

    def make_record(self):
        pass

    def make_render(self):
        pass
class GraphFusionRules:
    """Match fusion-rule query graphs against a host graph and report candidates.

    ``candidates`` holds, per matched rule, the list of isomorphism matches;
    ``task_duration_list`` mirrors that structure with per-node task durations.
    """

    def __init__(self, fusion_rules: str):
        # Path of the yaml file describing the fusion patterns.
        self.fusion_rules = fusion_rules
        self.candidates = []
        self.task_duration_list = []

    @staticmethod
    def build_query_graph(query_graphs) -> List["Graph"]:
        """Yield a built Graph for every sub-graph of every fusion rule."""
        for _, query_graph in query_graphs.fusion_rules.items():
            for sub_graph in query_graph:
                graph = Graph(*sub_graph)
                graph.build()
                yield graph

    def find_fusion_matched_issues(self, graphs: List["GraphDataset"]):
        """Search the last host graph for sub-graphs isomorphic to each rule."""
        query_graphs = QueryGraphParser(self.fusion_rules)
        with tqdm(total=query_graphs.num_rules, leave=False, ncols=100, unit=" rules") as pbar:
            pbar.set_description(f"Searching Isomorphic Subgraph")
            for query_graph in self.build_query_graph(query_graphs):
                query_candidates = find_isomorphisms(query_graph.graph, graphs[0].graphs[-1].graph)
                pbar.update(1)
                if len(query_candidates) > 0:
                    self.candidates.append(query_candidates)

    def find_fusion_matched_issues_with_times(self, graphs: List["GraphDataset"], profiling):
        """Find fusion issues and attach task durations, ranking candidates by cost."""
        self.find_fusion_matched_issues(graphs)
        if not self.candidates or not profiling:
            return

        if not hasattr(profiling[0], 'op_summary') or profiling[0].op_summary is None:
            # Fall back to msprof timing when no op summary is available.
            if hasattr(profiling[0], 'msprof'):
                self.match_time_from_msprof(profiling[0].msprof)
                return
            logger.warning("Skip analyze operator because of not containing op summary.")
            return

        self.match_time_from_summary(profiling[0].op_summary)
        # Rank candidates by their summed task duration, most expensive first.
        totals = [sum(sum(duration) for duration in task_duration)
                  for task_duration in self.task_duration_list]
        order = sorted(range(len(totals)), key=totals.__getitem__, reverse=True)
        self.task_duration_list = [self.task_duration_list[i] for i in order]
        self.candidates = [self.candidates[i] for i in order]

    def match_time_from_summary(self, op_summary):
        """Fill task_duration_list from an op-summary task dict (missing ops -> 0.0)."""
        op_dict = op_summary.task_dict
        for candidates in self.candidates:
            candidate_duration = []
            for candidate in candidates:
                duration_list = []
                for node in candidate.values():
                    # A node counts as matched only when both name and op type agree.
                    matched = (node.op_name in op_dict
                               and op_dict[node.op_name][0].op_type.lower() == node.op_type.lower())
                    if not matched:
                        logger.warning("Operator %s is missing in op summary, which will be set to 0.",
                                       node.op_name)
                        duration_list.append(0.0)
                    else:
                        duration_list.append(float(op_dict[node.op_name][0].task_duration))
                candidate_duration.append(duration_list)
            self.task_duration_list.append(candidate_duration)

    def match_time_from_msprof(self, msprof):
        """Fill task_duration_list from msprof tasks keyed by item_id (missing ops -> 0.0)."""
        op_dict = dict()
        for task in msprof.tasks:
            if "item_id" not in task.args:
                continue
            op_dict[task.args["item_id"]] = {"task_duration": task.dur}
        for candidates in self.candidates:
            candidate_duration = []
            for candidate in candidates:
                duration_list = []
                for node in candidate.values():
                    if node.op_name not in op_dict:
                        logger.warning("Operator %s is missing in msprof, which will be set to 0.",
                                       node.op_name)
                        duration_list.append(0.0)
                        continue
                    duration_list.append(float(op_dict[node.op_name].get("task_duration")))
                candidate_duration.append(duration_list)
            self.task_duration_list.append(candidate_duration)

    def make_render(self, html_render):
        """Render all fusion candidates through the fusion.html template."""
        if not self.candidates:
            return

        candidates_list = []
        for case_id, nodes in enumerate(self.candidates):
            case_info = dict()
            case_info['counts'] = len(nodes)
            case_info['matches'] = []
            has_time_info = bool(self.task_duration_list)
            if has_time_info:
                case_info['total_duration'] = round(sum(sum(duration) for duration in
                                                        self.task_duration_list[case_id]), 2)
            for node_index, refer_node in enumerate(nodes):
                match = []
                pass_name = ','.join(item.op_type for item in refer_node.keys())
                for index, (query_node, host_node) in enumerate(refer_node.items()):
                    fusion_pattern = query_node.op_pass
                    # op_pass / fusion_pattern are recorded once per case.
                    if 'op_pass' not in case_info:
                        case_info['op_pass'] = fusion_pattern
                    if 'fusion_pattern' not in case_info:
                        case_info['fusion_pattern'] = pass_name
                    match_attr = dict()
                    match_attr['op_name'] = host_node.op_name
                    match_attr['dtype'] = query_node.op_type
                    if has_time_info:
                        match_attr['duration'] = round(self.task_duration_list[case_id][node_index][index], 2)
                    match.append(match_attr)
                # Trailing summary row for the whole match.
                match_attr = dict()
                match_attr['op_name'] = "-"
                match_attr['dtype'] = "-"
                if has_time_info:
                    match_attr['duration'] = round(sum(self.task_duration_list[case_id][node_index]), 2)
                match.append(match_attr)
                case_info['matches'].append(match)
            candidates_list.append(case_info)
        html_render.render_template(key="computation",
                                    template_dir="templates",
                                    template_name="fusion.html",
                                    candidates=candidates_list)

    def make_record(self, result: "OptimizeResult"):
        """
        make record for what and how to optimize
        """
        if not self.candidates:
            return

        optimization_item = OptimizeItem(
            "fusion issue",
            f"Found {len(self.candidates)} fusion issues",
            ["Check fusion issues detail in att_advisor*.html"]
        )
        total_time = sum(sum(duration)
                         for candidate in self.task_duration_list
                         for duration in candidate)
        statistics_item = StatisticsItem(0,
                                         total_time,
                                         sum(len(candidate) for candidate in self.candidates)
                                         )
        result.add(OptimizeRecord(optimization_item, statistics_item))

        record_title = [
            "issue_id", "graph_name", "op_name", "fusion_structure", "fusion_pattern",
            "op_type", "input_shape", "input_format",
            "input_dtype", "output_shape", "output_format", "output_dtype"
        ]
        result.add_detail('fusion issues', headers=record_title)

        for case_id, nodes in enumerate(self.candidates):
            for refer_node in nodes:
                pass_name = ','.join(item.op_type for item in refer_node.keys())
                for query_node, host_node in refer_node.items():
                    detail = [
                        case_id,
                        host_node.graph_name,
                        host_node.op_name,
                        pass_name,
                        query_node.op_pass,
                        query_node.op_type,
                        self.get_attr_shape(host_node, "input", "shape"),
                        self.get_attr_type(host_node, "input", "format"),
                        self.get_attr_type(host_node, "input", "dtype"),
                        self.get_attr_shape(host_node, "output", "shape"),
                        self.get_attr_type(host_node, "output", "format"),
                        self.get_attr_type(host_node, "output", "dtype"),
                    ]
                    result.add_detail('fusion issues', detail=detail)

    @staticmethod
    def get_attr_shape(node, type_name: str, attr_name: str) -> str:
        """Join per-tensor shapes with ';', dimensions within a shape with ','."""
        shapes = [",".join(getattr(attrs, attr_name, []))
                  for attrs in getattr(node, type_name, [])]
        return ";".join(shapes)

    @staticmethod
    def get_attr_type(node, type_name: str, attr_name: str) -> str:
        """Join the given attribute (format/dtype) of every input/output with ';'."""
        values = [getattr(attrs, attr_name, "")
                  for attrs in getattr(node, type_name, [])]
        return ";".join(values)
import logging
from typing import Dict, List

from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
from profiler.advisor.display.html.render import HTMLRender
from profiler.advisor.result.result import OptimizeResult
from profiler.compare_tools.compare_backend.utils.constant import Constant
from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface

logger = logging.getLogger()


class OverallSummaryAnalyzer(BaseAnalyzer):
    """Compare a profiling run against a benchmark and render the overall time table."""

    def __init__(self, profiling_path, benchmark_profiling_path=None, **kwargs):
        # Without an explicit benchmark the run is compared against itself.
        # NOTE(review): BaseAnalyzer.__init__ is not called here — confirm the
        # base initialisation is intentionally skipped.
        self.benchmark_profiling_path = benchmark_profiling_path or profiling_path
        self.profiling_path = profiling_path
        self.html_render = HTMLRender()
        self.result = OptimizeResult()

    def optimize(self, **kwargs):
        """Run the overall comparison and render its time-distribution table.

        :return: the raw comparison result dict from ComparisonInterface
        """
        compare_result = ComparisonInterface(self.benchmark_profiling_path, self.profiling_path).compare(
            Constant.OVERALL_COMPARE)

        distribution = compare_result.get('Model Profiling Time Distribution')
        headers = distribution.get("headers", [])
        rows = distribution.get("rows", [])

        self.make_record()
        self.make_render(headers=headers, rows=rows)
        return compare_result

    def make_record(self):
        pass

    def make_render(self, **kwargs):
        """Render the overall-analysis html; skips rendering when there is no data."""
        headers = kwargs.get("headers")
        rows = kwargs.get("rows")

        if not headers or not rows:
            logger.info("Empty headers or rows, skip render overall analysis html")
            # Bug fix: the original logged "skip" but fell through and rendered anyway.
            return
        self.html_render.render_template(key="overall",
                                         template_dir="templates",
                                         template_name="overall_analysis.html",
                                         headers=headers,
                                         rows=rows)
import os
import copy

import logging
from typing import Dict, List

from profiler.advisor.display.html.render import HTMLRender
from profiler.advisor.result.result import OptimizeResult
from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
from profiler.compare_tools.compare_backend.utils.constant import Constant
from profiler.advisor.common import constant as const
from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface
from profiler.advisor.utils.utils import get_file_path_from_directory, load_parameter

logger = logging.getLogger()


class OverallSummaryAnalyzer(BaseAnalyzer):
    """Summarize overall performance (computing / communication / free time),
    optionally against a benchmark run, and record/render the bottlenecks."""

    OVERALL_SUMMARY_ANALYZER = "overall_summary_analysis"
    advice_map = {
        "Computing Time": "if you want more detailed advice please go to att_advisor_*.html",
        "Uncovered Communication Time": "if you want more detailed advice please go to att_advisor_*.html",
        "Free Time": "if you want more detailed advice please go to att_advisor_*.html"
    }
    time_name_map = {
        "Computing Time": "computing",
        "Uncovered Communication Time": "communication",
        "Free Time": "free",
        'Cube Time(Num)': 'Cube Time',
        'Vector Time(Num)': 'Vector Time',
        'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)',
        'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)',
        'Other Time': "Other Computing Time",
        'SDMA Time(Num)': 'SDMA Time'
    }
    performance_time_dict = {
        "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)',
                           'Flash Attention Time(Backward)(Num)', 'Other Time'],
        "Uncovered Communication Time(Wait Time)": [],
        "Free Time": ['SDMA Time(Num)']
    }

    def __init__(self, collection_path: str, n_processes: int = 1, **kwargs):
        profile_path = get_profile_path(collection_path)
        super().__init__(profile_path, n_processes, **kwargs)
        self.base_collection_path = kwargs.get("base_collection_path", "")
        self._has_base_collection = False
        self._is_minimal_profiling = False
        self.cur_data = {}
        self.cur_data_table = {}
        self.cur_bottleneck = {}
        self.cur_advices = ""
        self._headers = []
        self._base_data = []
        self._comparison_data = []
        self.html_render = HTMLRender()
        self.result = OptimizeResult()
        self.bottleneck_str = ""
        self.bottleneck_table = {}

    @staticmethod
    def split_duration_and_num(time_value: str) -> tuple:
        """Split a value like "0.229s(1756)" into (duration_seconds, kernel_num).

        Returns (0.0, None) parts for whatever cannot be parsed.
        """
        split_data = time_value.split("s")  # time value example: 0.229s(1756)
        duration, num = 0.0, None
        if len(split_data) >= 2:
            try:
                num = int(split_data[1].strip("()"))
            except ValueError:
                pass
        if len(split_data) >= 1:
            try:
                duration = float(split_data[0])
            except ValueError:
                # Consistency: use the module logger instead of bare print.
                logger.warning("Invalid time value: %s.", time_value)
        return duration, num

    @staticmethod
    def calculate_ratio(dividend, divisor):
        """Return dividend/divisor, or inf when the divisor is falsy (e.g. 0)."""
        if not divisor:
            return float("inf")
        return dividend / divisor

    def path_check(self):
        """Validate the base collection path (optional) and the collection path."""
        if self.base_collection_path:
            if os.path.exists(self.base_collection_path):
                self._has_base_collection = True
            else:
                logger.warning("Invalid path which not exists: %s.", self.base_collection_path)
        return os.path.exists(self.collection_path)

    def process(self):
        """Run the overall comparison and split the result into category durations."""
        base_collection_path = self.base_collection_path if self._has_base_collection else self.collection_path
        result_data = ComparisonInterface(base_collection_path, self.collection_path).compare(Constant.OVERALL_COMPARE)
        for data in result_data.values():
            self._headers = data.get("headers", [])
            rows = data.get("rows", [])
            # Exactly two rows are expected: base run first, comparison run second.
            if len(rows) == 2:
                self._base_data = rows[0]
                self._comparison_data = rows[1]
        if not self._headers or not self._comparison_data:
            return
        self._is_minimal_profiling = 'E2E Time(Not minimal profiling)' not in self._headers
        if self._has_base_collection:
            self.cur_data["comparison_result"] = result_data
        time_category_dict = {}
        for time_category, time_list in self.performance_time_dict.items():
            time_value = self.get_time_value(time_category, self._comparison_data)
            if time_value == Constant.INVALID_VALUE:
                continue
            duration, _ = self.split_duration_and_num(time_value)
            time_category = time_category.split("(")[0]
            time_category_dict[time_category] = duration
            self.get_sub_category_time(time_category, time_list, duration)
        self.cur_data["overall_data"] = time_category_dict

    def get_time_value(self, header_name: str, data_list: list):
        """Look up a column value by header name; INVALID_VALUE if absent."""
        try:
            data_index = self._headers.index(header_name)
        except ValueError:
            return Constant.INVALID_VALUE
        try:
            time_value = data_list[data_index]
        except IndexError:
            return Constant.INVALID_VALUE
        return time_value

    def get_sub_category_time(self, category: str, time_list: list, total_duration: float):
        """Collect per-subtype duration/ratio/kernel-count columns for a category."""
        sub_time_dict = {}
        for time_name in time_list:
            time_value = self.get_time_value(time_name, self._comparison_data)
            if time_value == Constant.INVALID_VALUE:
                continue
            sub_time_dict.setdefault(f"{category} Subtype", []).append(self.time_name_map.get(time_name, ""))
            duration, num = self.split_duration_and_num(time_value)
            sub_time_dict.setdefault("Duration(s)", []).append(duration)
            sub_time_dict.setdefault("Duration Ratio", []).append(
                "{:.2%}".format(self.calculate_ratio(duration, total_duration)))
            sub_time_dict.setdefault("Kernel Number", []).append(num)
        self.cur_data[self.time_name_map.get(category)] = sub_time_dict

    def identify_bottleneck(self):
        """Derive human-readable bottleneck descriptions from the overall data."""
        overall_data = self.cur_data.get("overall_data")
        if not overall_data:
            return
        # Bug fix: the original stored '%.3f' % sum(...) — a string — and later
        # passed it to calculate_ratio, raising TypeError on the division.
        # Keep the value numeric; format only inside the message.
        e2e_time = sum(overall_data.values())
        overall_bottleneck = f"The Model E2E Time is {e2e_time:.3f}s.\n"
        comparison_bottleneck = ""
        for time_type, time_value in overall_data.items():
            # add subtype time bottleneck
            self.cur_bottleneck[self.time_name_map.get(time_type)] = f"{time_type} is {time_value}s.\n"
            # add overall bottleneck
            overall_bottleneck += f" -- {time_type} is {time_value}s\n"
            if time_type == "Free Time" and self._is_minimal_profiling \
                    and self.calculate_ratio(time_value, e2e_time) > 0.1:
                overall_bottleneck += "percentage of free time exceed the threshold 10%."
            if not self._has_base_collection:
                continue
            # add comparison bottleneck
            time_type_origin = "Uncovered Communication Time(Wait Time)" \
                if time_type == "Uncovered Communication Time" else time_type
            base_duration, _ = self.split_duration_and_num(self.get_time_value(time_type_origin, self._base_data))
            if time_value > base_duration:
                ratio = "{:.2%}".format(self.calculate_ratio(time_value - base_duration, base_duration))
                comparison_bottleneck += f"{time_type} exceeds the benchmark by {ratio}\n"
        self.cur_bottleneck["overall_data"] = overall_bottleneck
        if comparison_bottleneck:
            self.cur_bottleneck["comparison_result"] = comparison_bottleneck

    def optimize(self, **kwargs):
        """Full pipeline: compare, find bottlenecks, format, record and render."""
        if self.path_check():
            self.process()
        self.identify_bottleneck()
        self.format_bottleneck()
        self.format_cur_data()
        self.make_record()
        self.make_render()
        return self.result

    def format_bottleneck(self):
        """Flatten cur_bottleneck into a display string and a single-row table."""
        result = ''
        headers = []
        data_list = []
        data = []
        for key, value in self.cur_bottleneck.items():
            if not value:
                continue
            result += f'{key}: {value} \n'
            headers.append(key)
            data.append(value)
        data_list.append(data)
        self.bottleneck_str = result
        self.bottleneck_table["headers"] = headers
        self.bottleneck_table["data"] = data_list

    def format_cur_data(self):
        """Convert every collected data dict into a headers/data table."""
        if not self.cur_data:
            return
        for data_type, data in self.cur_data.items():
            if not data:
                continue
            if data_type not in self.time_name_map.values():
                data_list = list(data.values())
            else:
                data_list = [','.join(map(str, value)) for value in data.values()]
            headers = list(data.keys())
            data_table = {"headers": headers, "data": [data_list]}
            self.cur_data_table[data_type] = copy.deepcopy(data_table)

    def make_record(self):
        """
        make record for what and how to optimize
        """
        if not self.bottleneck_str and not self.cur_advices:
            return
        optimization_item = OptimizeItem(
            OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER,
            self.bottleneck_str,
            self.cur_advices
        )
        self.result.add(OptimizeRecord(optimization_item))

        self.result.add_detail(const.BOTTLENECK, self.bottleneck_table["headers"], self.bottleneck_table["data"][0])
        for data_type, data_dict in self.cur_data_table.items():
            if data_dict:
                self.result.add_detail(const.DATA + data_type, data_dict["headers"], data_dict["data"][0])

    def make_render(self):
        """Render the overall summary html (skipped when there is nothing to show)."""
        if not self.bottleneck_str and not self.cur_advices:
            return
        result_for_html = {
            "Description": self.bottleneck_str,
            "suggestion": self.cur_advices,
            "details": [self.bottleneck_table]
        }

        # NOTE(review): template_name "cluster_analysis.html" looks copy-pasted
        # from the cluster analyzer — confirm an overall template isn't intended.
        self.html_render.render_template(key="overall",
                                         title=OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER,
                                         template_dir="templates",
                                         template_name="cluster_analysis.html",
                                         cann_version=self.cann_version,
                                         torch_version=self.torch_version,
                                         result=result_for_html)


def get_profile_path(collection_path):
    """Return the first directory under collection_path containing a
    profiler_info* file, or "" when none is found."""
    for root, _, files in os.walk(collection_path):
        for file_name in files:
            if file_name.startswith("profiler_info"):
                return root
    return ""
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2024, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging


from profiler.advisor.common import constant as const
from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer
from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset
from profiler.advisor.result.item import OptimizeItem, OptimizeRecord
from profiler.advisor.result.result import OptimizeResult
from profiler.advisor.display.html.render import HTMLRender

logger = logging.getLogger()


class OpDispatchAnalyzer(BaseAnalyzer):
    """Detect excessive operator compilation (slow op dispatch) in a timeline."""
    dataset_cls_list = [TimelineEventDataset]

    def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None:
        super().__init__(collection_path, n_processes, **kwargs)
        self.dataset = self.get_first_data_by_key(self.dataset_list, TimelineEventDataset.get_key())
        self.result = OptimizeResult()
        self.html_render = HTMLRender()
        self._op_compile = None
        self._issues_record = []
        self.optimization_item = []

    def optimize(self, **kwargs):
        """Collect op-compile statistics, then record and render the findings.

        :return: the accumulated OptimizeResult
        """
        self.get_op_compile_info(self.dataset)
        self.make_record(self.result)
        self.make_render(self.html_render)
        return self.result

    def get_op_compile_info(self, event_dataset: TimelineEventDataset):
        """Read the ops_compile summary from the dataset, if present and significant."""
        if not hasattr(event_dataset, "ops_compile"):
            logger.debug("Skip operator compile checker, because no op_compile attr find.")
            return
        self._op_compile = getattr(event_dataset, "ops_compile")
        # Below-threshold compile counts are not worth reporting.
        if not self._op_compile or self._op_compile.total_count < const.MAX_OP_COMPILE_NUM:
            return
        self._issues_record.append(['operator dispatch',
                                    const.OP_COMPILE_ID,
                                    self._op_compile.total_count,
                                    self._op_compile.total_time])

    def make_record(self, result: OptimizeResult):
        """
        make record for what and how to optimize
        """
        if not self._op_compile or not self._issues_record:
            return
        desc = f"Found {self._op_compile.total_count} operator compile issues."
        suggestion = (f"Please use `torch_npu.npu.set_compile_mode(jit_compile=False)` to disable jit compile "
                      f"in dynamic shape usage.")
        self.optimization_item.append(OptimizeItem("Operator dispatch", desc, [suggestion]))
        for optimization in self.optimization_item:
            result.add(OptimizeRecord(optimization))
        result.add_detail('operator dispatch', headers=["Issues", "op name", "counts", "total time"])
        for op_info in self._issues_record:
            result.add_detail('operator dispatch', detail=op_info)

    def make_render(self, html_render):
        """Render the collected issues to the operator_dispatch html template."""
        optimizations = [dict(description=item.description,
                              suggestion=item.suggestion[0])
                         for item in self.optimization_item]
        issues = [dict(issue=record[0],
                       op_name=record[1],
                       counts=record[2],
                       total_time=record[3])
                  for record in self._issues_record]
        html_render.render_template(key="schedule",
                                    template_dir="templates",
                                    template_name="operator_dispatch.html",
                                    issues=issues,
                                    optimizers=optimizations)
b/profiler/advisor_review/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py @@ -0,0 +1,271 @@ +import multiprocessing +import logging +import re + +from tqdm import tqdm + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.common import constant as const +from profiler.advisor.common.analyzer_scopes import SupportedScopes +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.utils.utils import format_timeline_result +from profiler.advisor.common.timeline.fusion_ops_db import init_timeline_ops_db + +logger = logging.getLogger() + + +class TimelineFusionOpsAnalyzer(BaseAnalyzer): + dataset_cls_list = [TimelineEventDataset] + + def __init__(self, collection_path, n_processes: int = 1, **kwargs): + super().__init__(collection_path, n_processes, **kwargs) + self._matched_op_index = {} if self.n_processes <= 1 else multiprocessing.Manager().dict() + self.matched_op_stacks = {} + self.empty_stacks = True + key = TimelineEventDataset.get_key() + self.timeline_event_dataset = self.get_first_data_by_key(self.dataset_list, key) + + def optimize(self, **kwargs): + for mode in [const.ATEN.lower(), const.OPTIMIZER.lower()]: + + for op_combined, npu_apis in tqdm(getattr(init_timeline_ops_db(self.cann_version, self.torch_version), + f"_{mode}_op_api_map").items(), leave=False, ncols=100, + desc="Scanning timeline for affinity apis"): + for npu_api in npu_apis.split("/"): + self.find_fusion_ops(self.timeline_event_dataset, op_combined, npu_api, mode) + + self.query_stack(self.timeline_event_dataset) + + logger.info("Finish timeline analysis") + self.make_record() + self.make_render() + return self.result + + def find_fusion_ops(self, event_dataset, ops: str, npu_api: str, mode: str): + """ + :Param event_dataset: dataset of timeline event + :Param ops: 
operator combination with '-' as separator , e.g. permute-reshape + :Param npu_api: api of torch_npu, generally more efficient than torch api + :Param mode: aten or dequeue or optimizer + :Return: json of op_name and called times and detail stacks + """ + op_rule_pattern, enable_regex = self._format_rule_to_pattern(ops) + if not enable_regex: + self._match_ops(event_dataset, op_rule_pattern, npu_api, mode) + else: + try: + self._match_ops_with_regex(event_dataset, op_rule_pattern, npu_api, mode) + except Exception as e: + logger.warning("Failed to find fusion operators with regex %s, reason is %s", ops, e) + + def _match_ops(self, event_dataset, ops: str, npu_api: str, mode: str): + """ match operator based on fusion operators rule(without regex), + only strictly equals of op name list means matched + :Param event_dataset: dataset of timeline event + :Param ops: operator combination with '-' as separator , e.g. permute-reshape + :Param npu_api: api of torch_npu, generally more efficient than torch api + :Param mode: aten or dequeue or optimizer + """ + op_list = ops.split(const.OP_SEP) + + matched_op_index = set() + api_ops_matched = False + + for index, event in enumerate(getattr(event_dataset, mode)): + if self._replace_op_name_prefix(event.name, mode) != op_list[0]: + continue + tmp_dequeue_event_names = [self._replace_op_name_prefix(event.name, mode) for event in + getattr(event_dataset, mode)[index: index + len(op_list)]] + if tmp_dequeue_event_names != op_list: + continue + api_ops_matched = True + matched_op_index.add(event.dataset_index) + + if api_ops_matched: + self._matched_op_index[npu_api + f":{ops}"] = matched_op_index + + def _match_ops_with_regex(self, event_dataset, op_rule_pattern: str, npu_api: str, + mode: str): + """ match operator based on fusion operators rule(with regex), + using regex to support condition like 'a = torch.mul(xxx) if xxx else torch.add(xxx)' + :Param event_dataset: dataset of timeline event + :Param op_rule_pattern: fusion 
operators rule with regex definition , e.g. add-mul{0,10}, add-mul* + :Param npu_api: api of torch_npu, generally more efficient than torch api + :Param mode: aten or dequeue or optimizer + """ + matched_op_index = set() + total_op_name = "".join([f"{const.OP_SEP}{self._replace_op_name_prefix(event.name, mode)}{const.OP_SEP}" + for event in + getattr(event_dataset, mode)]) + + matched_pattern_index_tuple = [(x.start(0), x.end(0)) for x in re.finditer(op_rule_pattern, total_op_name)] + # convert list of index tuple to a whole list: [(3, 25), ...] -> [3, 25, ...] + total_ops_split_points = [num for sublist in matched_pattern_index_tuple for num in sublist] + + api_ops_matched = len(total_ops_split_points) != 0 + + op_index = [] + if 0 not in total_ops_split_points: + total_ops_split_points = [0] + total_ops_split_points + if len(list(total_op_name)) not in total_ops_split_points: + total_ops_split_points.append(len(list(total_op_name))) + + # convert total ops name like "-add-mul-xxx-div-" to small pieces like [["add", "mul"], [...], ["div"]] + # by the regex index and then calculate the real index for matched fusion operators in event dataset + for l, r in zip(total_ops_split_points, total_ops_split_points[1:]): + matched_op_flag = True if (l, r) in matched_pattern_index_tuple else False + matched_ops_list = total_op_name[l: r].strip(const.OP_SEP).split(const.OP_SEP + const.OP_SEP) + op_index.append([matched_op_flag, len(matched_ops_list)]) + for i, _ in enumerate(op_index): + if i > 0: + # calculate cumsum for indexing matched operator + op_index[i][1] = op_index[i][1] + op_index[i - 1][1] + op_index = [[False, 0]] + op_index + + for i, _ in enumerate(op_index): + if not op_index[i][0]: + continue + index = op_index[i - 1][1] + matched_op_index.add(index) + + if index > len(getattr(event_dataset, mode)) - 1: + continue + dataset_index = getattr(event_dataset, mode)[index].get("dataset_index") + matched_op_index.add(dataset_index) + + if api_ops_matched: + 
self._matched_op_index[npu_api + f":{op_rule_pattern}"] = sorted(list(matched_op_index)) + + def make_record(self): + """ + make record for what and how to optimize + """ + if not self.matched_op_stacks: + return + + desc = f"Found {len(format_timeline_result(self.matched_op_stacks))} apis to be replaced" \ + f" based on the runtime env cann-{self.cann_version} and torch-{self.torch_version}" + suggestion = "Please replace training api according to sub table 'Affinity training api'" + if self.empty_stacks: + desc += ", but with no stack" + suggestion = const.TIMELINE_EMPTY_STACKS_PROMPT.format( + timeline_profiling_doc_url=const.TIMELINE_WITH_STACK_DOC_URL + ) + + optimization_item = OptimizeItem( + SupportedScopes.TIMELINE_FUSION_OPS, + desc, + [suggestion] + ) + + self.result.add(OptimizeRecord(optimization_item)) + + record_title = ["Affinity API", "Code stacks", "Stack called counts"] + self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, headers=record_title) + + for api_name, stacks_info in format_timeline_result(self.matched_op_stacks).items(): + if not stacks_info: + detail = [api_name, "null", "null"] + self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, detail=detail) + else: + for stack in stacks_info: + detail = [api_name, *stack] + self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, detail=detail) + + def make_render(self): + format_result_for_html = format_timeline_result(dict(self.matched_op_stacks), dump_html=True) + + self.html_render.render_template(key="schedule", + template_dir="templates", + template_name="affinity_api.html", + cann_version=self.cann_version, + torch_version=self.torch_version, + empty_stacks=self.empty_stacks, + with_stack_doc_url=const.TIMELINE_WITH_STACK_DOC_URL, + api_doc_url=const.TIMELINE_API_DOC_URL, + result=format_result_for_html) + + def query_stack(self, event_dataset): + if all([len(matched_index) == 0 for matched_index in self._matched_op_index.values()]): + return + + op_stack_list = 
event_dataset.parse_data_with_generator(self._query_stack_by_matched_index) + for op_stack in op_stack_list: + for op_rule, stack in op_stack.items(): + if op_rule not in self.matched_op_stacks: + self.matched_op_stacks[op_rule] = {} + if stack == const.TIMELINE_FUSION_OPS_NO_STACK_FLAG: + continue + if stack not in self.matched_op_stacks[op_rule]: + self.matched_op_stacks[op_rule][stack] = 0 + self.matched_op_stacks[op_rule][stack] += 1 + + def _query_stack_by_matched_index(self, index, event): + stack_record = {} + event = TimelineEvent(event) + + matched_op_rules = [] + for op_rule, matched_index in self._matched_op_index.items(): + if index not in matched_index: + continue + + matched_op_rules.append(op_rule) + stack = event.args.get(const.CALL_STACKS) + + if not stack: + logger.debug("Got empty '%s' for event %s", const.CALL_STACKS, event) + continue + + if self.empty_stacks and stack: + self.empty_stacks = False + + stack_record[op_rule] = stack + + if matched_op_rules and not stack_record: + for op_rule in matched_op_rules: + stack_record[op_rule] = const.TIMELINE_FUSION_OPS_NO_STACK_FLAG + + return stack_record + + def _replace_op_name_prefix(self, event_name, mode): + if mode == const.DEQUEUE.lower(): + op_name_prefix = f"{const.DEQUEUE}{const.DEQUEUE_SEP}" + elif mode == const.ATEN: + op_name_prefix = f"{const.ATEN}{const.ATEN_SEP}" + else: + op_name_prefix = f"{const.OPTIMIZER}.{const.OPTIMIZER_STEP}{const.OPTIMIZER_SEP}" + + return event_name.replace(op_name_prefix, "") + + def _format_rule_to_pattern(self, op_rule): + """ + Args: + op_rule: like (mul){0,1}-(add|neg){0,2}-dropout-(softmax)* + + Returns: op_pattern like (-mul-){0,1}(-add-|-neg-){0,2}(-dropout-)(-softmax-)* + """ + enable_regex = False + if "(" not in op_rule and ")" not in op_rule: + # op_rule which requires fuzzy matching mush consist of "()" + return op_rule, enable_regex + + enable_regex = True + op_pattern_list = op_rule.split(const.OP_SEP) + format_op_pattern = "" + for op_pattern 
in op_pattern_list: + matched_res = re.search(r'\((.*?)\)', op_pattern) + + ops_index_range = (matched_res.start() + 1, matched_res.end() - 1) if matched_res else ( + 0, len(op_pattern)) + + op_names = op_pattern[ops_index_range[0]: ops_index_range[1]] + tmp_op_names_record = [] + for op_name in op_names.split("|"): + tmp_op_names_record.append(f"{const.OP_SEP}{op_name.strip(' ')}{const.OP_SEP}") + op_suffix = op_pattern[ops_index_range[1] + 1:] + op_names_format = f"({'|'.join(tmp_op_names_record)}){op_suffix}" + + format_op_pattern += op_names_format + return format_op_pattern, enable_regex diff --git a/profiler/advisor_review/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py b/profiler/advisor_review/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..f684a4892111f113f6c502a010c9e14ccd43768a --- /dev/null +++ b/profiler/advisor_review/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py @@ -0,0 +1,163 @@ +import logging +from typing import List + +from profiler.advisor.common import constant as const +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.utils.utils import get_analyze_processes, ParallelJob + +logger = logging.getLogger() + + +class OpStackFinder: + + def __init__(self): + self.n_processes = get_analyze_processes() + self._stack_record = [] + self._task_id_record = {} + self.op_name = None + self.task_type = None + self.matched_index = set() + + def get_api_stack_by_op(self, event_dataset: TimelineEventDataset, op_name: List[str] = None, task_type: str = None, + disable_multiprocess=False): + """ + :Param event_dataset: dataset of timeline event + :Param op_name: operator name, e.g. 
IndexPutV2 + :Param task_type: operator task type, optionals are AI_CPU and AI_CORE + :Param disable_multiprocess: disable multiprocessing, avoid cost time of enable new process for light task + """ + if not op_name: + op_name = [] + if not isinstance(op_name, list): + op_name = [op_name] + + self.op_name = ",".join(op_name) + self.task_type = task_type + op_name_list = event_dataset.task_op_names if not op_name else op_name + + if self.n_processes <= 1 or disable_multiprocess: + self._query_stacks_multiprocess(event_dataset, op_name_list, task_type) + else: + event_num_per_process = int(len(op_name_list) / self.n_processes) + 1 + parallel_analyzer = ParallelJob( + self._query_stacks_multiprocess, + [[event_dataset, op_name_list[i:i + event_num_per_process], task_type] + for i in range(0, len(op_name_list), event_num_per_process)], + job_name="Analyzing operator stacks from timeline" + ) + parallel_analyzer.start(self.n_processes) + self.query_stack(event_dataset) + + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + if not self._stack_record: + return + + desc = f"Found {len(self._stack_record)} called stacks for" + if self.op_name and self.task_type: + desc += f" operators with name '{self.op_name}' with task type '{self.task_type}'" + elif self.op_name and not self.task_type: + desc += f" operators with name '{self.op_name}'" + elif self.task_type and not self.op_name: + desc += f" operators with task type '{self.task_type}'" + else: + desc += " all operators" + + suggestion = f"Please use command 'ma-advisor analyze profiling' to analyze operators" + optimization_item = OptimizeItem( + "Operator stacks", + desc, + [suggestion] + ) + result.add(OptimizeRecord(optimization_item)) + + record_title = ["Task ID", "op name", "op type", "code stacks"] + result.add_detail('operator stacks', headers=record_title) + + for op_info in self._stack_record: + result.add_detail('operator stacks', detail=op_info) + + def 
_get_api_stack_by_op(self, event_dataset: TimelineEventDataset, op_name: str, task_type: str): + for _, src_op_event in event_dataset.ops_with_task_type.items(): + + op_task_type = src_op_event.get(const.TASK_TYPE) + if not (src_op_event.name == op_name and op_task_type and op_task_type == task_type): + continue + + torch_to_npu_key = f"s-{src_op_event.tid}-{src_op_event.ts}" + torch_to_npu_event = event_dataset.torch_to_npu.get(torch_to_npu_key) or event_dataset.torch_to_npu.get( + f"s-{src_op_event.ts}") or event_dataset.torch_to_npu.get(f"s-{src_op_event.ts.replace('.', '')}") + + acl_to_npu_event = src_op_event.ts in event_dataset.acl_to_npu + + if not torch_to_npu_event and not acl_to_npu_event: + continue + + # query stack by torch_to_npu first, due to each operator had acl_to_npu incoming flow in cann6.3 + if torch_to_npu_event: + dst_op_index = self._query_index_by_torch_to_npu(event_dataset, torch_to_npu_event) + else: + dst_op_index = self._query_index_by_acl_to_npu(acl_to_npu_event) + + if not dst_op_index: + continue + + task_id = src_op_event.task_id + if not task_id: + continue + self.matched_index.add(dst_op_index) + if dst_op_index not in self._task_id_record: + self._task_id_record[dst_op_index] = [] + self._task_id_record[dst_op_index].append([task_id, op_name, task_type]) + + def _query_index_by_torch_to_npu(self, event_dataset, torch_to_npu_event): + dst_op_event_key = torch_to_npu_event.ts + dst_op_event = event_dataset.ops_with_stack.get(dst_op_event_key) + + if not dst_op_event: + return const.TIMELINE_BACKWARD_NO_STACK_CODE + + return dst_op_event.get("dataset_index") + + def _query_index_by_acl_to_npu(self, acl_to_npu_event): + if acl_to_npu_event: + return const.TIMELINE_ACL_TO_NPU_NO_STACK_CODE + + def _query_stacks_multiprocess(self, event_dataset, op_name_list, task_type): + + for op_name in op_name_list: + if task_type is not None: + self._get_api_stack_by_op(event_dataset, op_name, task_type) + else: + 
self._get_api_stack_by_op(event_dataset, op_name, const.AI_CORE) + self._get_api_stack_by_op(event_dataset, op_name, const.AI_CPU) + + def _format_stack_record(self): + stack_list = [] + for task_id, stack_info in self._task_id_record.items(): + stack_list.append([task_id, *stack_info]) + return stack_list + + def _query_stack_by_matched_index(self, index, event): + if index not in self.matched_index: + return None + event = TimelineEvent(event) + stack = event.args.get(const.CALL_STACKS) + stack = stack if stack else const.NO_STACK_REASON_MAP.get(const.TIMELINE_BACKWARD_NO_STACK_CODE) + for matched_op_info in self._task_id_record.get(index, []): + self._stack_record.append([*matched_op_info, stack]) + + for matched_op_info in self._task_id_record.get(const.TIMELINE_ACL_TO_NPU_NO_STACK_CODE, []): + self._stack_record.append([*matched_op_info, + const.NO_STACK_REASON_MAP.get(const.TIMELINE_ACL_TO_NPU_NO_STACK_CODE)]) + return None + + def query_stack(self, event_dataset: TimelineEventDataset): + if not event_dataset.dataset_len: + return + _ = event_dataset.parse_data_with_generator(self._query_stack_by_matched_index) diff --git a/profiler/advisor_review/cluster_perf_analysis.ipynb b/profiler/advisor_review/cluster_perf_analysis.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..7ee0b24e85467fe42205c5986095a7e66bf0a636 --- /dev/null +++ b/profiler/advisor_review/cluster_perf_analysis.ipynb @@ -0,0 +1,1042 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "initial_id", + "metadata": { + "ExecuteTime": { + "end_time": "2023-11-21T13:31:25.022339600Z", + "start_time": "2023-11-21T13:31:25.016155200Z" + } + }, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../..\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c552da9d-36f9-43d3-ae1f-c54f78d3ff2d", + "metadata": {}, + "outputs": [], + "source": [ + "from profiler.advisor.interface.interface import Interface\n", + "import 
matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from prettytable import PrettyTable, ALL\n", + "from textwrap import fill" + ] + }, + { + "cell_type": "markdown", + "id": "57d17a21205c3c5e", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "# 集群调优分析\n", + "## 1. 集群分析的数据准备\n", + "首先我们当前支持PyTorch多卡大模型的集群分析,您需要输入集群分析的profiling_path路径,例如: \n", + "--{profiling_path} \n", + " -- xxxx_ascend_pt \n", + " -- xxxx_ascend_pt \n", + " -- xxxx_ascend_pt \n", + " ...... \n", + " -- xxxx_ascend_pt \n", + "里面每张卡的profiling文件都是ascend_pt结尾的文件。 \n", + "\n", + "## 2. 集群分析解决的问题 \n", + "当前的功能主要有四项: \n", + "1). 识别多卡间的计算慢卡(根据计算时间等推断) \n", + "2). 识别多卡间的通信慢现象(根据通信链路的带宽判断) \n", + "3). 对多卡间的计算算子进行统计展示(识别不同卡的算子差异) \n", + "4). 展示集群流水并行图(根据时间轴展示多卡间的计算和通信时间) " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "36b7a24cc7ca5da2", + "metadata": { + "ExecuteTime": { + "end_time": "2023-11-21T12:53:38.379699800Z", + "start_time": "2023-11-21T12:53:38.363755900Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# EDIT THE PROFILING DATA PATH\n", + "cluster_path = r\"YOUR PROFILING PATH\"\n", + "interface = Interface(profiling_path=cluster_path)" + ] + }, + { + "cell_type": "markdown", + "id": "cf832ac2e0dfa30f", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## 1) 识别慢卡" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "40aac93278dd6e34", + "metadata": { + "ExecuteTime": { + "end_time": "2023-11-21T12:53:41.815599700Z", + "start_time": "2023-11-21T12:53:41.783393700Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO]Cluster has been analyzed because of the existence of cluster analysis output directory.\n", + "[INFO]Skip Cluster analyze backend.\n" + ] + } + ], + "source": 
[ + "slow_rank_result = interface.get_result(\"cluster\", \"slow_rank\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0e943b2a-37a6-4db6-9e70-235d397f1d39", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rank_idcomputecommunicationfree
028976239.079999877586795.4199998116836641.679994211
129012279.1000001026984613.2200000257388343.859991224
229019115.323000517489956.6330000286881360.253991371
329027089.5600000777963312.2399997946389981.899993688
429044786.936999656533618.6390000177780517.1539908135
529178186.2599998537925184.4200000286286867.999995028
629025331.1899999046386639.907999927941798.704992032
729056803.3049995457234444.8260000247094608.035991492
831383314.9800002283973806.61699999968017981.379989724
931360536.362000194757458.8250000027277062.386991671
1031381891.8000004635276870.3599999986731073.659992552
1131387777.380000334727362.30000000457297578.339992355
1231374132.744999775164443.3880000046829798.933991944
1331377800.1789998044360616.2830000017624691.509991412
1431374658.3600003164457099.6200000017542724.319990785
1531387255.5270000065000860.9056975264.115991174
" + ], + "text/plain": [ + "+---------+--------------------+--------------------+--------------------+\n", + "| rank_id | compute | communication | free |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 0 | 28976239.07999987 | 7586795.419999811 | 6836641.679994211 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 1 | 29012279.100000102 | 6984613.220000025 | 7388343.859991224 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 2 | 29019115.32300051 | 7489956.633000028 | 6881360.253991371 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 3 | 29027089.560000077 | 7963312.239999794 | 6389981.899993688 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 4 | 29044786.93699965 | 6533618.639000017 | 7780517.1539908135 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 5 | 29178186.259999853 | 7925184.420000028 | 6286867.999995028 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 6 | 29025331.189999904 | 6386639.90799992 | 7941798.704992032 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 7 | 29056803.304999545 | 7234444.826000024 | 7094608.035991492 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 8 | 31383314.980000228 | 3973806.6169999996 | 8017981.379989724 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 9 | 31360536.36200019 | 4757458.825000002 | 7277062.386991671 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 10 | 31381891.800000463 | 5276870.359999998 | 6731073.659992552 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 11 | 31387777.38000033 | 
4727362.3000000045 | 7297578.339992355 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 12 | 31374132.74499977 | 5164443.388000004 | 6829798.933991944 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 13 | 31377800.178999804 | 4360616.283000001 | 7624691.509991412 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 14 | 31374658.360000316 | 4457099.620000001 | 7542724.319990785 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 15 | 31387255.527000006 | 5000860.905 | 6975264.115991174 |\n", + "+---------+--------------------+--------------------+--------------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "slow_rank_data = slow_rank_result.get(\"slow_rank_analysis\")\n", + "if slow_rank_data:\n", + " slow_rank_table = PrettyTable(slow_rank_data.get(\"headers\"))\n", + " for row in slow_rank_data.get(\"data\"):\n", + " row = [fill(str(element), width=80) for element in row]\n", + " slow_rank_table.add_row(row)\n", + " slow_rank_table.hrules = ALL\n", + " display(slow_rank_table[:16])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "57a9b1c6-4127-47a2-8699-3c983950bd84", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
problemdescription
slow_rank_analysisComputing has some issues in the cluster, because the max difference of Computing time
has reached 2411.538ms. Communication has some issues in the cluster, because the max
difference of Communication time has reached 3989.506ms.
" + ], + "text/plain": [ + "+--------------------+--------------------------------------------------------------------------------------------------+\n", + "| problem | description |\n", + "+--------------------+--------------------------------------------------------------------------------------------------+\n", + "| slow_rank_analysis | Computing has some issues in the cluster, because the max difference of Computing time |\n", + "| | has reached 2411.538ms. Communication has some issues in the cluster, because the max |\n", + "| | difference of Communication time has reached 3989.506ms. |\n", + "+--------------------+--------------------------------------------------------------------------------------------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "problems = slow_rank_result.get(\"problems\")\n", + "headers = problems.get('headers')[:2]\n", + "if problems: # 如果存在相关问题则获取相关问题检测描述及建议\n", + " problem_table = PrettyTable(headers)\n", + " for row in problems.get(\"data\"):\n", + " row = [fill(str(element), width=100) for element in row]\n", + " problem_table.add_row(row[:2])\n", + " display(problem_table)\n", + "else:\n", + " print(\"There is no suggestion related to slow rank analysis.\")" + ] + }, + { + "cell_type": "markdown", + "id": "3511befaff513e8e", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## 2)识别通信链路慢" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2a1e617d2a117125", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO]Cluster has been analyzed because of the existence of cluster analysis output directory.\n", + "[INFO]Skip Cluster analyze backend.\n" + ] + } + ], + "source": [ + "slow_link_result = interface.get_result(\"cluster\", \"slow_link\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": 13, + "id": "c8bca314-a8da-4a5b-985a-c36f00154552", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rank_idRDMA bandwidth(GB/s)RDMA size(mb)RDMA time(ms)SDMA bandwidth(GB/s)SDMA size(mb)SDMA time(ms)
00009.766842507.34694399984352.225880000002
100010.165342507.3467759997954181.611080000001
200010.47142507.3467759997954059.527798999999
30009.969142507.3467759997954263.9230400000015
40009.146942507.3467759997954647.202435000001
50009.466342507.3467759997954490.373999999999
60009.569242507.3467759997954442.106745000001
70009.844442507.3467759997954317.931616999999
800018.89542507.3899522249.662369
900018.911242507.390808000062247.7420159999997
1000018.771342507.390808000062264.48576
1100018.838942507.390808000062256.3606000000004
1200018.768742507.390808000062264.8021099999996
1300018.971742507.390808000062240.5713950000004
1400018.922642507.390808000062246.381839999999
1500018.834642507.390808000062256.8781
" + ], + "text/plain": [ + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| rank_id | RDMA bandwidth(GB/s) | RDMA size(mb) | RDMA time(ms) | SDMA bandwidth(GB/s) | SDMA size(mb) | SDMA time(ms) |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 0 | 0 | 0 | 0 | 9.7668 | 42507.3469439998 | 4352.225880000002 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 1 | 0 | 0 | 0 | 10.1653 | 42507.346775999795 | 4181.611080000001 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 2 | 0 | 0 | 0 | 10.471 | 42507.346775999795 | 4059.527798999999 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 3 | 0 | 0 | 0 | 9.9691 | 42507.346775999795 | 4263.9230400000015 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 4 | 0 | 0 | 0 | 9.1469 | 42507.346775999795 | 4647.202435000001 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 5 | 0 | 0 | 0 | 9.4663 | 42507.346775999795 | 4490.373999999999 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 6 | 0 | 0 | 0 | 9.5692 | 42507.346775999795 | 4442.106745000001 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 7 | 0 | 0 | 0 | 9.8444 | 42507.346775999795 | 4317.931616999999 
|\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 8 | 0 | 0 | 0 | 18.895 | 42507.389952 | 2249.662369 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 9 | 0 | 0 | 0 | 18.9112 | 42507.39080800006 | 2247.7420159999997 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 10 | 0 | 0 | 0 | 18.7713 | 42507.39080800006 | 2264.48576 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 11 | 0 | 0 | 0 | 18.8389 | 42507.39080800006 | 2256.3606000000004 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 12 | 0 | 0 | 0 | 18.7687 | 42507.39080800006 | 2264.8021099999996 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 13 | 0 | 0 | 0 | 18.9717 | 42507.39080800006 | 2240.5713950000004 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 14 | 0 | 0 | 0 | 18.9226 | 42507.39080800006 | 2246.381839999999 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 15 | 0 | 0 | 0 | 18.8346 | 42507.39080800006 | 2256.8781 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "slow_link_data = slow_link_result.get(\"slow_link_analysis\")\n", + "if 
slow_link_data:\n", + " slow_link_table = PrettyTable(slow_link_data.get(\"headers\"))\n", + " for row in slow_link_data.get(\"data\"):\n", + " for i in range(len(row)):\n", + " row[i] = fill(str(row[i]), width=60)\n", + " slow_link_table.add_row(row)\n", + " slow_link_table.hrules = ALL\n", + " display(slow_link_table[:16])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "77d6efa1-48e3-409f-82c4-3e2b3d868898", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
problemdescription
slow_rank_analysisComputing has some issues in the cluster, because the max difference of Computing time
has reached 2411.538ms. Communication has some issues in the cluster, because the max
difference of Communication time has reached 3989.506ms.
slow_link_analysisSDMA bandwidth(GB/s): The average is 14.332, while the maximum is 18.972GB/s and the
minimum is 9.147GB/s. the difference is 9.825GB/s.
" + ], + "text/plain": [ + "+--------------------+------------------------------------------------------------------------------------------------------+\n", + "| problem | description |\n", + "+--------------------+------------------------------------------------------------------------------------------------------+\n", + "| slow_rank_analysis | Computing has some issues in the cluster, because the max difference of Computing time |\n", + "| | has reached 2411.538ms. Communication has some issues in the cluster, because the max |\n", + "| | difference of Communication time has reached 3989.506ms. |\n", + "| slow_link_analysis | SDMA bandwidth(GB/s): The average is 14.332, while the maximum is 18.972GB/s and the |\n", + "| | minimum is 9.147GB/s. the difference is 9.825GB/s. |\n", + "+--------------------+------------------------------------------------------------------------------------------------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "problems = slow_link_result.get(\"problems\")\n", + "headers = problems.get('headers')[:2]\n", + "if problems: # 如果存在相关问题则获取相关问题检测描述及建议\n", + " problem_table = PrettyTable(headers)\n", + " for row in problems.get(\"data\"):\n", + " row = [fill(str(element), width=100) for element in row]\n", + " problem_table.add_row(row[:2])\n", + " display(problem_table)\n", + "else:\n", + " print(\"There is no suggestion related to slow link analysis.\")" + ] + }, + { + "cell_type": "markdown", + "id": "ce27a1d3-1354-45f7-88d8-dcb8e438b2b2", + "metadata": {}, + "source": [ + "## 3) 分布式卡上的kernel算子统计展示" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "466a0f30-042c-492a-bbf2-a5a85b649f95", + "metadata": {}, + "outputs": [], + "source": [ + "from advisor_backend.interface import Interface\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "e05774e9-c47e-400f-8421-b4b71bcdcbc4", + 
"metadata": {}, + "outputs": [], + "source": [ + "interface = Interface(cluster_path)\n", + "dataset = interface.get_data('cluster', 'kernel')" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "e95b6849-1738-4975-929f-734edff5d1c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rank idNameInput ShapesInput Data TypesOutput ShapesDuration(us)_meanDuration(us)_varDuration(us)_maxDuration(us)_minDuration(us)_countDuration(us)_sum
00Add100\"4096,10880;4096,10880\"FLOAT;FLOAT\"4096,10880\"478.210918237.729252721.420449.801024489687.980
10Add102\"21760;21760\"FLOAT;FLOAT\"21760\"4.3903910.0119154.8203.9810244495.760
20Add106\"21760,4096;21760,4096\"FLOAT;FLOAT\"21760,4096\"933.504395462.9793211257.140927.381024955908.500
30Add111\"4096,4096;4096,4096\"FLOAT;FLOAT\"4096,4096\"91.2673632.15827597.12085.12102493457.780
40Add118\"12288,4096;12288,4096\"FLOAT;FLOAT\"12288,4096\"526.3120121462.617511787.780424.241024538943.500
....................................
251315trans_Cast_12\"4096,1,1,128\"FLOAT\"4096,1,1,128\"8.4864950.0601749.8208.20204817380.342
251415trans_Cast_13\"4096,1,1,128\"FLOAT\"4096,1,1,128\"10.5345640.16638012.9009.48204821574.787
251515trans_Cast_14\"4096,1,1,128\"FLOAT\"4096,1,1,128\"9.7845510.29536813.0218.56204820038.761
251615trans_Cast_15\"4096,1,1,128\"DT_BF16\"4096,1,1,128\"8.3422110.12047110.2207.86204817084.848
251715trans_Cast_16\"4096,1,1,128\"DT_BF16\"4096,1,1,128\"9.5075890.11711111.6819.18204819471.543
\n", + "

2518 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " rank id Name Input Shapes Input Data Types \\\n", + "0 0 Add100 \"4096,10880;4096,10880\" FLOAT;FLOAT \n", + "1 0 Add102 \"21760;21760\" FLOAT;FLOAT \n", + "2 0 Add106 \"21760,4096;21760,4096\" FLOAT;FLOAT \n", + "3 0 Add111 \"4096,4096;4096,4096\" FLOAT;FLOAT \n", + "4 0 Add118 \"12288,4096;12288,4096\" FLOAT;FLOAT \n", + "... ... ... ... ... \n", + "2513 15 trans_Cast_12 \"4096,1,1,128\" FLOAT \n", + "2514 15 trans_Cast_13 \"4096,1,1,128\" FLOAT \n", + "2515 15 trans_Cast_14 \"4096,1,1,128\" FLOAT \n", + "2516 15 trans_Cast_15 \"4096,1,1,128\" DT_BF16 \n", + "2517 15 trans_Cast_16 \"4096,1,1,128\" DT_BF16 \n", + "\n", + " Output Shapes Duration(us)_mean Duration(us)_var Duration(us)_max \\\n", + "0 \"4096,10880\" 478.210918 237.729252 721.420 \n", + "1 \"21760\" 4.390391 0.011915 4.820 \n", + "2 \"21760,4096\" 933.504395 462.979321 1257.140 \n", + "3 \"4096,4096\" 91.267363 2.158275 97.120 \n", + "4 \"12288,4096\" 526.312012 1462.617511 787.780 \n", + "... ... ... ... ... \n", + "2513 \"4096,1,1,128\" 8.486495 0.060174 9.820 \n", + "2514 \"4096,1,1,128\" 10.534564 0.166380 12.900 \n", + "2515 \"4096,1,1,128\" 9.784551 0.295368 13.021 \n", + "2516 \"4096,1,1,128\" 8.342211 0.120471 10.220 \n", + "2517 \"4096,1,1,128\" 9.507589 0.117111 11.681 \n", + "\n", + " Duration(us)_min Duration(us)_count Duration(us)_sum \n", + "0 449.80 1024 489687.980 \n", + "1 3.98 1024 4495.760 \n", + "2 927.38 1024 955908.500 \n", + "3 85.12 1024 93457.780 \n", + "4 424.24 1024 538943.500 \n", + "... ... ... ... 
\n", + "2513 8.20 2048 17380.342 \n", + "2514 9.48 2048 21574.787 \n", + "2515 8.56 2048 20038.761 \n", + "2516 7.86 2048 17084.848 \n", + "2517 9.18 2048 19471.543 \n", + "\n", + "[2518 rows x 11 columns]" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "27b75df4-792b-43dc-aa5c-d3c265642c1e", + "metadata": {}, + "outputs": [], + "source": [ + "# 保存到csv查看, 可修改保存路径\n", + "dataset.to_csv('cluster_kernel_details.csv', index=False, sep='\\t')" + ] + }, + { + "cell_type": "markdown", + "id": "ae45826394463cc4", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## 4) 展示集群流水并行图\n", + "使用说明: \n", + "1). 需要使用Ascend Torch Profiler采集数据,如果需要展示FP和BP需要将activities设置为采集CPU和NPU \n", + "2). rank_ids为要展示的rank id列表,必选参数, 可视化顺序与rank_ids的顺序一致 \n", + "3). worker_num为多进程数量,可选参数,请根据机器配置调整,默认值为机器可用核心数的一半 \n", + "4). 如果没有采集CPU数据,则展示Stage和Bubble的流水图 \n", + "5). 
生成的json文件可以在chrome trace中查看 \n", + "\n", + "示例图:\n", + "![pipeline_view](../../profiler/test/resource/pipeline_view.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "baf66781eccfbca1", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] Start to process 8 rank profiling data with 8 workers.\n", + "[INFO] Pipline view data process finished, cost 98.48s.\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "# rank_ids为要呈现的rank id列表,必选参数\n", + "# 可以使用列表推导式生成需要的rank_ids,最终展示顺序和rank_ids的顺序一致\n", + "# worker_num为多进程数量,可选参数,请根据机器配置调整,默认值为机器可用核心数的一半\n", + "dataset = interface.get_data(\"cluster\", \"pipeline\", rank_ids=[0, 1, 2, 3, 4, 5, 6, 7], worker_num=8)\n", + "\n", + "# 保存json数据,在chrome trace中查看\n", + "with open(\"./pipeline_view.json\", \"w\") as f:\n", + " json.dump(dataset.get(\"data\", []), f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f34ecf5-5c4a-4bc0-a761-e6338e534bac", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/profiler/advisor_review/common/__init__.py b/profiler/advisor_review/common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor_review/common/analyzer_scopes.py b/profiler/advisor_review/common/analyzer_scopes.py new file mode 100644 index 0000000000000000000000000000000000000000..592f9d421e2bfad53a9ea621d951ae0166221623 
class SupportedScopes:
    """
    Registry of scope names for the fourth-level advisor sub-commands.

    Each attribute doubles as the key of the corresponding entry in the
    analysis result dict, so every value below must be identical to its
    attribute name (lower-cased).
    """

    # timeline-based analyses
    TIMELINE_FUSION_OPS = "timeline_fusion_ops"
    TIMELINE_OP_DISPATCH = "timeline_op_dispatch"

    # graph / cluster analyses
    GRAPH = "graph"
    SLOW_RANK = "slow_rank"
    SLOW_LINK = "slow_link"
    OVER_ALL = "over_all"

    # operator-level analyses
    DYNAMIC_SHAPE_ANALYSIS = "dynamic_shape_analysis"
    AICPU_ANALYSIS = "aicpu_analysis"
    BLOCK_DIM_ANALYSIS = "block_dim_analysis"
    OPERATOR_NO_BOUND_ANALYSIS = "operator_no_bound_analysis"
# ---------------------------------------------------------------------------
# Advisor-wide constants. Grouped by the subsystem that consumes them.
# ---------------------------------------------------------------------------

# timeline: event names / separators used when parsing trace timelines
DEQUEUE = "Dequeue"
DEQUEUE_SEP = "@"
ATEN = "aten"
NPU = "npu"
ATEN_SEP = "::"
OPTIMIZER = "Optimizer"
OPTIMIZER_SEP = "#"
OPTIMIZER_STEP = "step"
ENQUEUE = "enqueue"
TORCH_TO_NPU = "torch_to_npu"
OP_COMPILE_NAME = "AscendCL@aclopCompileAndExecute"
OP_COMPILE_ID = "aclopCompileAndExecute"
MAX_OP_COMPILE_NUM = 20
ACL_TO_NPU = "acl_to_npu"
TASK_TYPE = "Task Type"
CPU_OP = "cpu_op"
AI_CORE = "AI_CORE"
AI_CPU = "AI_CPU"
CALL_STACKS = "Call stack"
INPUT_DIMS = "Input Dims"
OP_SEP = "-"
MA_ADVISOR_MAX_PROCESSES = 16
MA_ADVISOR_ANALYZE_PROCESSES = "MA_ADVISOR_ANALYZE_PROCESSES"
TIMELINE_OP_STACKS_DATASET = "timeline_op_stacks_dataset"
TIMELINE_BACKWARD_NO_STACK = "Backward broadcast, without call stacks in profiling."
TIMELINE_ACL_TO_NPU_NO_STACK = "Incoming flow is 'acl_to_npu', without call stacks in profiling."
TIMELINE_BACKWARD_NO_STACK_CODE = -1
TIMELINE_ACL_TO_NPU_NO_STACK_CODE = -2
TIMELINE_FUSION_OPS_NO_STACK_FLAG = "NO STACK"
# Fix: reference the message constants defined above instead of duplicating
# the literals, so the reason text cannot drift out of sync.
NO_STACK_REASON_MAP = {
    TIMELINE_BACKWARD_NO_STACK_CODE: TIMELINE_BACKWARD_NO_STACK,
    TIMELINE_ACL_TO_NPU_NO_STACK_CODE: TIMELINE_ACL_TO_NPU_NO_STACK,
}
TIMELINE_API_DOC_URL = "https://support.huaweicloud.com/bestpractice-modelarts/modelarts_10_2516.html"
AFFINITY_TRAINING_API = "Affinity training api"
TIMELINE_WITH_STACK_DOC_URL = "https://www.hiascend.com/document/detail/zh/canncommercial/" \
                              "70RC1/modeldevpt/ptmigr/AImpug_0067.html"
PyTorch_AOE_OPERATOR_TUNE_URL = "https://www.hiascend.com/document/detail/zh/canncommercial/" \
                                "70RC1/devtools/auxiliarydevtool/aoe_16_045.html"
# NOTE(review): "OPEATOR" typo is kept in the name for backward compatibility with callers.
MSLite_Infer_AOE_OPEATOR_TUNE_URL = "https://www.mindspore.cn/lite/docs/en/master/use/cloud_infer/converter_tool_ascend.html#aoe-auto-tuning"
ENABLE_COMPILED_TUNE_URL = "https://www.hiascend.com/document/detail/zh/canncommercial/" \
                           "70RC1/modeldevpt/ptmigr/AImpug_0059.html"

ASCEND_PROFILER_URL = "https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/modeldevpt/ptmigr/AImpug_0067.html"
# {timeline_profiling_doc_url} is filled in by the caller via str.format
TIMELINE_EMPTY_STACKS_PROMPT = "These APIs have no code stack. If parameter 'with_stack=False' while profiling, " \
                               "please refer to {timeline_profiling_doc_url} to set 'with_stack=True'. " \
                               "Otherwise, ignore following affinity APIs due to backward broadcast lack of stack."

CLUSTER_ANALYSIS = "Cluster analysis"
SLOW_RANK_TIME_RATIO_THRESHOLD = 0.05

# version_control: CANN / torch versions the advisor knows about
CANN_VERSION_C30 = '6.3.RC2'
CANN_VERSION_C13 = '7.0.RC1'
CANN_VERSION_C15 = '7.0.0'
CANN_VERSION_C17 = '8.0.0'
SUPPORTED_CANN_VERSION = [CANN_VERSION_C30, CANN_VERSION_C13, CANN_VERSION_C15, CANN_VERSION_C17]
DEFAULT_CANN_VERSION = CANN_VERSION_C17
ASCEND_PYTORCH_PROFILER = "ascend_pytorch_profiler"
MSLITE = "mslite"
MSPROF = "msprof"
SUPPORTED_PROFILING_TYPE = [ASCEND_PYTORCH_PROFILER, MSLITE, MSPROF]
DEFAULT_PROFILING_TYPE = ASCEND_PYTORCH_PROFILER
TORCH_VERSION_1_11_0 = '1.11.0'
TORCH_VERSION_2_1_0 = '2.1.0'

SUPPORTED_TORCH_VERSION = [TORCH_VERSION_1_11_0, TORCH_VERSION_2_1_0]
DEFAULT_TORCH_VERSION = TORCH_VERSION_2_1_0

TERMINAL_OUTPUT_HEADERS = ["No.", "Problem", "Description", "Suggestion"]
SKIP_ANALYZE_PROMPT = "Finish analysis, no optimization suggestions"
SKIP_QUERY_PROMPT = "Finish query operator stack, no operators"

# operator output constant
OPERATOR_OUT_TOPK = 10
OPERATOR_LIST_UNLIMIT = -1

DEFAULT_OPERATOR_TYPE = 'None_type'
DEFAULT_DURATION_ZERO = 0.0

# logging configuration (env var name + allowed values)
ADVISOR_LOG_LEVEL = "ADVISOR_LOG_LEVEL"
DEFAULT_LOG_LEVEL = "INFO"
SUPPORTED_LOG_LEVEL = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]

# cloud rule download configuration
RULE_BUCKET = "RULE-BUCKET"
CLOUD_RULE_REGION_CN_NORTH_9 = "cn-north-9"
CLOUD_RULE_REGION_CN_NORTH_7 = "cn-north-7"
CLOUD_RULE_REGION_CN_SOUTHWEST_2 = "cn-southwest-2"
CLOUD_RULE_REGION_LIST = [CLOUD_RULE_REGION_CN_NORTH_7, CLOUD_RULE_REGION_CN_NORTH_9, CLOUD_RULE_REGION_CN_SOUTHWEST_2]
INNER_REGION_LIST = [CLOUD_RULE_REGION_CN_NORTH_7]
DEFAULT_CLOUD_RULE_REGION = CLOUD_RULE_REGION_CN_SOUTHWEST_2

HTTP_PREFIXES = "http://"
HTTPS_PREFIXES = "https://"
COMMON_YAML_DIR = "modelarts/solution/ma_advisor_rules/"
COMMON_ENDPOINT_SUFFIX = "obs.{}.myhuaweicloud.com"
INNER_ENDPOINT_SUFFIX = "obs.{}.ulanqab.huawei.com"

AICPU_RULES_YAML_NAME = "aicpu_rules.yaml"
FUSION_PASS_YAML_NAME = "op_fusion_pass.yaml"
TIMELINE_FUSION_OPS_YAML_NAME = "timeline_fusion_ops.yaml"
CLOUD_YAML_NAME_LIST = [AICPU_RULES_YAML_NAME, FUSION_PASS_YAML_NAME, TIMELINE_FUSION_OPS_YAML_NAME]

MAX_RETRIES = 3
TIMEOUT = 3

ADVISOR_RULE_PATH = "ADVISOR_RULE_PATH"
CLOUD_RULE_PATH = "rules/cloud/"
DEFAULT_RULE_PATH = "./rules/"

TIMELINE_FUSION_OPS_INVALID_UNIQUE_ID = -1

DEFAULT_TEMPLATE_HEADER = "Performance Optimization Suggestions"

# profiling output layout
PT_PROF_SUFFIX = "ascend_pt"
ASCEND_PROFILER_OUTPUT = "ASCEND_PROFILER_OUTPUT"
COLLECTION_PATH = "collection_path"
CLUSTER_ANALYSIS_OUTPUT = "cluster_analysis_output"
KERNEL_DETAILS_CSV = "kernel_details.csv"
CLUSTER_STEP_TIME_CSV = "cluster_step_trace_time.csv"
CLUSTER_COMM_JSON = "cluster_communication.json"

BOTTLENECK = "bottleneck"
DATA = "data"
class Graph:
    """
    Directed-graph wrapper around ``networkx.DiGraph``.

    Keeps the raw ``nodes`` mapping (op name -> parsed node) and the raw
    ``edges`` list alongside the built ``nx.DiGraph`` instance.
    """

    # pylint: disable=too-many-instance-attributes
    def __init__(self,
                 nodes: Dict[str, Optional[Union[HostGraphNode, QueryGraphNode]]] = None,
                 edges: List[Tuple[Optional[Union[HostGraphNode, QueryGraphNode]],
                                   Optional[Union[HostGraphNode, QueryGraphNode]]]] = None,
                 name: str = None):
        self.name = name
        self.graph = nx.DiGraph(name=name)
        self.nodes = nodes if nodes is not None else {}
        self.edges = edges if edges is not None else list()

    def build(self):
        """Insert all recorded nodes and edges into the DiGraph and return it."""
        for op_name, node in self.nodes.items():
            # store op_type as a node attribute so attribute matchers can use it
            self.add_node(node, op_type=node.op_type)
        for edge in self.edges:
            self.add_edge(*edge)
        return self.graph

    def get_size(self) -> Dict[str, int]:
        """Return the node and edge counts of the underlying graph."""
        if not hasattr(self.graph, "nodes"):
            return {"edges": 0, "nodes": 0}

        return {"edges": len(self.graph.edges),
                "nodes": len(self.graph.nodes)}

    def add_node(self, node: HostGraphNode, **kwargs):
        """Add a single node; silently ignores ``None``."""
        if node is None:
            return
        self.graph.add_node(node, **kwargs)

    def add_edge(self, pre_node: HostGraphNode, next_node: HostGraphNode):
        """Add an edge; both endpoints must already be present in the graph."""
        if pre_node is None or next_node is None:
            return

        if pre_node not in self.graph or \
                next_node not in self.graph:
            # Consistency fix: use the module-level logger like the rest of the
            # class instead of the logging module function.
            logger.error("Nodes between edge should be both exists.")
            return

        self.graph.add_edge(pre_node, next_node)

    def add_node_with_edge(self, node, adj_nodes: List[HostGraphNode]):
        """Add ``node`` plus one outgoing edge to each node in ``adj_nodes``."""
        self.add_node(node)
        for adj in adj_nodes:
            self.add_edge(node, adj)

    def remove_node(self, node: HostGraphNode = None) -> None:
        if node is None:
            return

        self.graph.remove_node(node)

    def remove_edge(self, pre_node: HostGraphNode = None, next_node: HostGraphNode = None) -> None:
        """Remove the edge ``pre_node -> next_node`` from the graph."""
        if pre_node is None or next_node is None:
            # Fix: message previously printed pre_node twice.
            raise ValueError(f"Invalid edge from {pre_node} to {next_node}.")

        # Fix: this method used to call ``self.remove_edge(...)``, recursing
        # forever; delegate to the underlying networkx graph instead.
        self.graph.remove_edge(pre_node, next_node)

    def get_subgraph(self, nodes: List[HostGraphNode]) -> nx.DiGraph:
        """Return the induced subgraph over ``nodes`` (must all exist)."""
        nodes = list(set(nodes))
        for node in nodes:
            if not self.is_node_exists(node):
                raise ValueError(f"Failed to subtract subgraph because {node.op_name} is not in the graph.")

        return self.graph.subgraph(nodes)

    def highlight_subgraph(self, subgraph: nx.DiGraph = None) -> None:
        """Placeholder for future visual highlighting support."""
        pass

    def get_node(self, node: HostGraphNode):
        """Return the adjacency view of ``node`` (networkx ``graph[node]``), or None."""
        if node not in self.graph:
            return

        return self.graph[node]

    def get_node_by_name(self, node_name: str):
        """Look up the parsed node object by operator name."""
        return self.nodes.get(node_name, None)

    def is_node_exists(self, node: HostGraphNode):
        return node in self.graph

    def draw(self,
             graph: nx.DiGraph = None,
             with_labels: bool = False,
             labels: Dict[HostGraphNode, Any] = None,
             pos_func: Callable = None,
             font_weight: str = "bold",
             savefig: bool = False,
             node_size: int = 50,
             **kwargs
             ):
        """Render the graph with matplotlib; optionally save as ``<name>.png``."""
        try:
            import matplotlib.pylab as plt
        except ImportError:
            logger.error('Please install matplotlib first by using `pip install matplotlib`.')
            return

        if graph is None:
            graph = self.graph

        pos = pos_func(graph) if pos_func is not None else None

        if with_labels:
            if labels is None:
                labels = {k: f"{k}\n({v['op_name']})" for k, v in graph.nodes.items()}

        nx.draw(graph,
                with_labels=with_labels,
                pos=pos,
                node_size=node_size,
                font_weight=font_weight,
                labels=labels,
                **kwargs
                )
        if savefig:
            plt.savefig(self.name + ".png")
        plt.show()
@lru_cache()
def match_node_attr_fun(query_node: Hashable,
                        host_node: Hashable,
                        query_graph: "nx.Graph",
                        host_graph: "nx.Graph"
                        ) -> bool:
    """
    Check whether a host node carries every attribute of a query node.

    String attribute values are compared case-insensitively; all other
    values must be equal. Results are memoized via ``lru_cache``.

    :param query_node: Query graph node
    :param host_node: Host graph node
    :param query_graph: Query Graph
    :param host_graph: Host graph
    :return: bool, match or not
    """
    if query_node not in query_graph.nodes:
        return False
    if host_node not in host_graph.nodes:
        return False

    expected_attrs = query_graph.nodes[query_node]
    actual_attrs = host_graph.nodes[host_node]
    for attr_name, expected in expected_attrs.items():
        if attr_name not in actual_attrs:
            return False
        actual = actual_attrs[attr_name]
        if isinstance(actual, str) and isinstance(expected, str):
            if actual.lower() != expected.lower():
                return False
        elif actual != expected:
            return False
    return True


@lru_cache()
def match_node_struct_fun(query_node: Hashable,
                          host_node: Hashable,
                          query_graph: "nx.Graph",
                          host_graph: "nx.Graph"
                          ) -> bool:
    """
    Check whether a host node has at least the degree of a query node.

    :param query_node: Query graph node
    :param host_node: Host graph node
    :param query_graph: Query Graph
    :param host_graph: Host graph
    :return: bool, match or not
    """
    if query_node not in query_graph.nodes:
        return False
    if host_node not in host_graph.nodes:
        return False

    return host_graph.degree(host_node) >= query_graph.degree(query_node)


@lru_cache()
def match_edge_attr_fun(query_edge: Tuple[Hashable, Hashable],
                        host_edge: Tuple[Hashable, Hashable],
                        query_graph: "nx.Graph",
                        host_graph: "nx.Graph"
                        ) -> bool:
    """
    Check whether a host edge carries every attribute of a query edge.

    String attribute values are compared case-insensitively; all other
    values must be equal. Results are memoized via ``lru_cache``.

    :param query_edge: Query graph edge
    :param host_edge: Host graph edge
    :param query_graph: Query Graph
    :param host_graph: Host graph
    :return: bool, match or not
    """
    if query_edge not in query_graph.edges:
        return False
    if host_edge not in host_graph.edges:
        return False

    expected_attrs = query_graph.edges[query_edge]
    actual_attrs = host_graph.edges[host_edge]
    for attr_name, expected in expected_attrs.items():
        if attr_name not in actual_attrs:
            return False
        actual = actual_attrs[attr_name]
        if isinstance(actual, str) and isinstance(expected, str):
            if actual.lower() != expected.lower():
                return False
        elif actual != expected:
            return False
    return True
def find_isomorphisms(query_graph: "nx.Graph",
                      host_graph: "nx.Graph",
                      *args,
                      _node_attr_fun: Callable = match_node_attr_fun,
                      _node_struct_fun: Callable = match_node_struct_fun,
                      _edge_attr_fun: Callable = match_edge_attr_fun,
                      limit: int = None,
                      **kwargs) -> List[Dict[Hashable, Hashable]]:
    """
    Find all the sub graphs that are isomorphic to query_graph in host_graph.

    :param query_graph: The graph object to query
    :param host_graph: The graph object to be queried
    :param args: Position args
    :param _node_attr_fun: The function to match node attr
    :param _node_struct_fun: The function to match node structural
    :param _edge_attr_fun: The function to match edge attr
    :param limit: The limitation for the number of returned mappings
    :param kwargs: Keyword args
    :return: Matched node mapping list
    ```
    [{query_id: host_id, ...}, ...]
    ```
    """
    matches = []
    mapping_stream = find_isomorphisms_iter(query_graph,
                                            host_graph,
                                            *args,
                                            _node_attr_fun=_node_attr_fun,
                                            _node_struct_fun=_node_struct_fun,
                                            _edge_attr_fun=_edge_attr_fun,
                                            **kwargs)
    for mapping in mapping_stream:
        matches.append(mapping)
        # stop early once the requested number of mappings has been collected
        if limit and len(matches) >= limit:
            break
    return matches


def find_isomorphisms_iter(query_graph: "nx.Graph",
                           host_graph: "nx.Graph",
                           directed: bool = None,
                           _node_attr_fun: Callable = None,
                           _node_struct_fun: Callable = None,
                           _edge_attr_fun: Callable = None,
                           ) -> Generator[Dict[Hashable, Hashable], None, None]:
    """
    A generator to find one isomorphic subgraph in host_graph for query_graph.

    :param query_graph: The graph object to query
    :param host_graph: The graph object to be queried
    :param directed: Whether direction should be considered during search
    :param _node_attr_fun: The function to match node attr
    :param _node_struct_fun: The function to match node structural
    :param _edge_attr_fun: The function to match edge attr
    :return: Yield mappings from query node IDs to host graph IDs: {query_id: host_id, ...}
    """
    if directed is None:
        # Direction is respected only when both graphs are directed.
        directed = isinstance(query_graph, nx.DiGraph) and isinstance(host_graph, nx.DiGraph)

    # Breadth-first expansion of partial mappings, seeded with the empty one.
    pending = deque()
    pending.appendleft({})

    while pending:
        partial_map = pending.pop()
        extensions = get_next_candidates(backbone=partial_map,
                                         query_graph=query_graph,
                                         host_graph=host_graph,
                                         directed=directed,
                                         _node_attr_fun=_node_attr_fun,
                                         _node_struct_fun=_node_struct_fun,
                                         _edge_attr_fun=_edge_attr_fun,
                                         )
        for candidate in extensions:
            if len(candidate) == len(query_graph):
                # every query node is mapped: a legal isomorphism
                yield candidate
            else:
                # partial mapping: keep extending it
                pending.appendleft(candidate)
def get_next_candidates(
        backbone: Dict,
        query_graph: "nx.Graph",  # noqa
        host_graph: "nx.Graph",  # noqa
        next_node: Hashable = None,
        directed: bool = True,  # noqa
        _node_attr_fun: Callable = None,  # noqa
        _node_struct_fun: Callable = None,  # noqa
        _edge_attr_fun: Callable = None  # noqa
) -> List[Dict[Hashable, Hashable]]:
    """
    Get a list of candidate node assignments for the next "step" of this map.

    :param backbone: Mapping of query node IDs to one set of host graph IDs
    :param next_node: Optional suggestion for the next node to assign
    :return: List[Dict[Hashable, Hashable]]: A new list of node mappings with one additional element mapped
    """
    node_priority = {n: 1 for n in query_graph.nodes}
    candidate_nodes = []

    if next_node is None and len(backbone) == 0:
        # Start case: pick the highest-priority query node and try every host node.
        next_node = max(node_priority.keys(),
                        key=lambda x: node_priority.get(x, 0))

        for node in host_graph.nodes:
            if _node_attr_fun(next_node, node, query_graph, host_graph) and \
                    _node_struct_fun(next_node, node, query_graph, host_graph):
                candidate_nodes.append({next_node: node})
        return candidate_nodes

    # Collect unassigned query nodes adjacent to the current backbone.
    nodes_with_maximum_backbone = []
    for query_node_id in query_graph.nodes:
        if query_node_id in backbone:
            continue

        if not directed:
            backbone_neighbors = query_graph.adj[query_node_id]
        else:
            # nx.DiGraph.pred: A <- B: find previous node from B to A
            # nx.DiGraph.adj: A -> B : find next node from A to B
            backbone_neighbors = list(set(query_graph.adj[query_node_id]).union(set(query_graph.pred[query_node_id])))

        query_backbone_node_count = sum(1 for _node in backbone_neighbors if _node in backbone)
        if query_backbone_node_count > 0:
            nodes_with_maximum_backbone.append(query_node_id)

    if not nodes_with_maximum_backbone:
        # Robustness fix: max() over an empty list raised ValueError when no
        # unassigned query node touches the backbone (disconnected query graph).
        logging.warning("No unassigned query node is adjacent to the current backbone.")
        return []

    # next_node is connected to the current backbone.
    next_node = max(nodes_with_maximum_backbone, key=lambda x: node_priority.get(x, 0))

    # Verify all edges between `next_node` and nodes in the backbone exist in host graph.
    # Step1: find all edges between `next_node` and nodes in the backbone
    next_edge_edges = []
    for _node in query_graph.adj[next_node]:
        if _node in backbone:
            # `next_node` -> `_node`
            next_edge_edges.append((None, next_node, _node))

    if directed:
        for _node in query_graph.pred[next_node]:
            if _node in backbone:
                # `_node` -> `next_node`
                next_edge_edges.append((_node, next_node, None))

    if len(next_edge_edges) == 0:
        logging.warning("Find node without any edge, which is invalid.")
        return []

    # Step2: collect candidate host nodes that have such edges in the host graph
    candidate_nodes = []
    if len(next_edge_edges) == 1:
        source, _, target = next_edge_edges[0]
        if not directed:
            candidate_nodes = list(host_graph.adj[backbone[target]])
        else:
            if source is not None:
                # means `source` is a `from` edge
                candidate_nodes = list(host_graph.adj[backbone[source]])
            elif target is not None:
                # means `target` is a `from` edge
                candidate_nodes = list(host_graph.pred[backbone[target]])

    else:
        candidate_nodes_set = set()
        # BUG FIX: this loop previously iterated `candidate_nodes`, which is
        # always empty here, so multi-edge candidates were never computed.
        # It must iterate the edges collected in Step1.
        for (source, _, target) in next_edge_edges:
            if not directed:
                candidate_nodes_from_this_edge = host_graph.adj[backbone[target]]
            else:
                if source is not None:
                    candidate_nodes_from_this_edge = host_graph.adj[backbone[source]]
                else:  # target is not None:
                    candidate_nodes_from_this_edge = host_graph.pred[backbone[target]]

            if len(candidate_nodes_set) > 0:
                candidate_nodes_set = candidate_nodes_set.intersection(candidate_nodes_from_this_edge)
            else:
                # Initialize candidate_nodes_set
                candidate_nodes_set.update(candidate_nodes_from_this_edge)
        candidate_nodes = list(candidate_nodes_set)

    tentative_results = []
    for _node in candidate_nodes:
        if all([_node not in backbone.values(),
                _node_attr_fun(next_node, _node, query_graph, host_graph),
                _node_struct_fun(next_node, _node, query_graph, host_graph)]
               ):
            tentative_results.append({**backbone,
                                      next_node: _node})

    final_candidates = check_edges_mapping(tentative_results,
                                           query_graph=query_graph,
                                           host_graph=host_graph,
                                           _edge_attr_fun=_edge_attr_fun)
    return final_candidates


def check_edges_mapping(candidates: List[Dict[Hashable, Hashable]],
                        query_graph: "nx.Graph",
                        host_graph: "nx.Graph",
                        _edge_attr_fun: Callable = None
                        ) -> List[Dict[Hashable, Hashable]]:
    """
    Check that all edges between the assigned nodes exist in the host graph.

    :param candidates: mapping nodes candidates
    :param query_graph: The graph object to query
    :param host_graph: The graph object to be queried
    :param _edge_attr_fun: The function to match edge attr
    :return: candidates whose mapped edges all exist (and match) in the host graph
    """
    monomorphism_candidates = []

    for candidate in candidates:
        if len(candidate) != len(query_graph):
            # partial mapping: defer full edge verification
            monomorphism_candidates.append(candidate)
            continue

        all_pass_flag = True
        for edge_start, edge_end in query_graph.edges:
            # check edge in host graph
            if not host_graph.has_edge(candidate[edge_start], candidate[edge_end]):
                all_pass_flag = False
                break

            # check edge attr
            if _edge_attr_fun is None or not _edge_attr_fun(
                    (edge_start, edge_end),
                    (candidate[edge_start], candidate[edge_end]),
                    query_graph,
                    host_graph
            ):
                all_pass_flag = False
                break

        if all_pass_flag:
            monomorphism_candidates.append(candidate)

    # Isomorphisms check: reject mappings whose host image has extra edges
    final_candidates = []
    for candidate in monomorphism_candidates:
        all_product = itertools.product(candidate.keys(), candidate.keys())
        for edge_start, edge_end in all_product:
            if not query_graph.has_edge(edge_start, edge_end) and \
                    host_graph.has_edge(candidate[edge_start], candidate[edge_end]):
                break
        else:
            final_candidates.append(candidate)
    return final_candidates
@dataclass
class Tensor:
    """Shape/dtype/format description of a single operator tensor."""

    def __init__(self):
        super().__init__()
        # current and original shape information
        self.shape = []
        self.origin_shape = []
        self.shape_range = []
        self.origin_shape_range = []
        # current and original data types
        self.dtype = ""
        self.origin_data_type = ""
        # current and original layouts
        self.format = ""
        self.origin_format = []


@dataclass
class Attr:
    """A single key/value attribute parsed from the graph dump."""

    def __init__(self):
        super().__init__()
        self.key = str()
        self.value = []


class HostGraphNode:
    """One operator node parsed from a host graph dump."""

    def __init__(self):
        super().__init__()
        self.graph_name = str()
        self.op_name = str()
        self.op_type = str()
        # producer names / tensors and consumer names / tensors
        self.inputs = []
        self.input = []
        self.outputs = []
        self.output = []
        # convolution-style attributes
        self.strides = []
        self.pads = []
        self.groups = ""
        self.dilations = []
        self.kernelname = ""
        self._attrs = []

    def __repr__(self):
        # NOTE(review): the repr content appears to have been lost in the
        # original source (empty f-string); preserved byte-for-byte.
        return f""


@dataclass
class HostGraph:
    """A named collection of HostGraphNode objects plus their edges."""

    def __init__(self):
        super().__init__()
        self.name = ""
        self.nodes = {}
        self.inputs = []
        self.edges = []
        self.model_name = None
        self.file_path = None

    def build(self):
        """build a graph"""
        # Wire producer -> consumer: every node listed in a consumer's
        # `inputs` gets that consumer appended to its `outputs`.
        for consumer_name, node in self.nodes.items():
            for producer_name in node.inputs:
                if producer_name in self.nodes:
                    self.nodes[producer_name].outputs.append(consumer_name)
del self.graphs[0] + + @staticmethod + def _get_key_value( line): + res = line.split(':', 1) + return res[0].strip(), res[1].strip().strip('"') + + @staticmethod + def _parse_attr(key, value, obj): + if not isinstance(obj, list) and not obj: + return + if key == "dim" and hasattr(obj, "shape"): + obj.shape.append(value) + elif key == "name" and hasattr(obj, "op_name"): + obj.op_name = value + elif key == "name" and hasattr(obj, "name"): + obj.name = value + elif key == "dtype" and hasattr(obj, "dtype"): + obj.dtype = value + elif key == "layout" and hasattr(obj, "format"): + obj.format = value + elif key == "type" and hasattr(obj, "op_type"): + obj.op_type = value + elif key == "input" and hasattr(obj, "input"): + obj.inputs.append(value.strip('"').split(':')[0]) + elif key == "key" and hasattr(obj, "key"): + obj.key = value + elif hasattr(obj, key): + setattr(obj, key, value) + elif isinstance(obj, list) and key != "val_type": + obj.append(value) + + def _parse_struct(self, in_file, key, in_obj): + + def parse_shape(file, obj): + obj = self._parse_line(file, obj) + + def parse_input_desc(file, obj): + tensor = self._parse_line(file, Tensor()) + if obj and hasattr(obj, "input"): + obj.input.append(tensor) + + def parse_out_desc(file, obj): + tensor = self._parse_line(file, Tensor()) + if obj and hasattr(obj, "output"): + obj.output.append(tensor) + + def parse_op(file, obj: HostGraph): + node = self._parse_line(file, HostGraphNode()) + if hasattr(obj, "name"): + node.graph_name = obj.name + if obj and hasattr(obj, "nodes") and node.op_name: + obj.nodes[node.op_name] = node + + def parse_graph(file, obj): + graph = self._parse_line(file, HostGraph()) + obj.append(graph) + + def parse_attr(file, obj): + attr = self._parse_line(file, Attr()) + if hasattr(obj, attr.key): + if attr.key not in ['format']: + setattr(obj, attr.key, attr.value) + elif attr.key.endswith("_kernelname"): + setattr(obj, "kernelname", attr.value) + if obj and hasattr(obj, "get_attrs"): + 
obj.get_attrs().append(attr) + + def parse_list(file, obj): + value = [] + self._parse_line(file, value) + if isinstance(obj, list): + obj.append(value) + else: + obj = value + + def parse_value(file, obj): + if hasattr(obj, "value"): + obj.value = self._parse_line(file, obj.value) + + def parse_default(file, _obj=None): + """function with unused argument""" + self._parse_line(file, None) + + parse_methods = { + "shape": parse_shape, + "input_desc": parse_input_desc, + "output_desc": parse_out_desc, + "op": parse_op, + "graph": parse_graph, + "attr": parse_attr, + "list_list_int": parse_list, + "list_list_i": parse_list, + "list": parse_list, + "value": parse_value, + } + parse_methods.get(key, parse_default)(in_file, in_obj) + + def _read_line(self, file): + self.line_no += 1 + line = file.readline() + if line.strip().endswith('}'): + end_line = "" + while self.buffer and not end_line.strip().endswith("{"): + end_line = self.buffer.pop() + else: + self.buffer.append(line) + return line.strip() + + def _parse_line(self, file, obj=None): + line = self._read_line(file) + try: + while line and not line.endswith("}"): + if line.endswith('{'): + key = line.rstrip('{').strip() + self._parse_struct(file, key, obj) + else: + key, value = self._get_key_value(line) + self._parse_attr(key, value, obj) + line = self._read_line(file) + except Exception as exception: + if self.buffer: + logger.debug("***********************graph content**************************") + while self.buffer: + line = self.buffer.popleft() + logger.debug(line) + logger.debug("***********************graph content**************************") + raise exception + return obj + + def _parse(self, graph_file): + # pylint:disable=broad-except + graph_list = [] + with open(graph_file, "r", encoding="gbk") as file: + try: + graph_list = self._parse_line(file, graph_list) + except Exception: + logger.error( + "Parse line %s of file %s failed, make sure the format is correct.", self.line_no, graph_file + ) + graphs 
class QueryGraphNode:
    """
    Node of a query (pattern) graph.

    Each instance receives a unique, auto-incremented id (class counter
    ``_ID``) so that two nodes with the same op type are still distinct.
    """
    _ID = 0

    def __init__(self, op_type: str, op_pass: str):
        self._op_type = op_type
        self._id = QueryGraphNode._ID
        self._op_pass = op_pass
        QueryGraphNode._ID += 1

    def get_property(self, name):
        """
        Return attribute ``name``; falls back to a no-op callable when the
        attribute is missing (kept for backward compatibility).
        """
        return getattr(self, name, lambda: None)

    @property
    def op_type(self):
        return self._op_type

    @property
    def op_name(self):
        # Unique display name, e.g. "Conv2D_id_42".
        return self._op_type + "_id_" + str(self._id)

    @property
    def op_pass(self):
        return self._op_pass

    @op_type.setter
    def op_type(self, op_type):
        self._op_type = op_type

    def __eq__(self, other):
        # FIX: comparing against a non-QueryGraphNode used to raise
        # AttributeError; return NotImplemented so Python falls back to the
        # reflected comparison (and ultimately False).
        if not isinstance(other, QueryGraphNode):
            return NotImplemented
        return self._op_type == other._op_type and \
               self._id == other._id

    def __hash__(self):
        return hash(self._op_type + str(self._id))

    @staticmethod
    def trim_string(string: str, length: int = -1):
        """
        Trim string to target length.

        :param string: Original string
        :param length: Target length of string, -1 (or any negative) keeps the
                       original string.
        :return: Trimmed string
        :raises TypeError: when ``string`` is not a str (covers None too).
        """
        if not isinstance(string, str):
            raise TypeError(f"Param string must be a string type but got {type(string)}.")

        if length <= -1 or len(string) <= length:
            return string

        return string[:length]
node in graph_struct: + pre_node = next_node + next_node = QueryGraphNode(node, graph_name) + nodes[next_node.op_name] = next_node + if pre_node is None or next_node is None: + continue + edges.append((pre_node, next_node,)) + graphs.append((nodes, edges, graph_name,)) + return graphs + + @staticmethod + def build_query_graph_v1(graph_name: str, + nodes_list: List[Dict], + edges_list: List[List[str]]) -> List[Tuple]: + graphs = [] + node_index = dict() + multi_node_list = [] + for index, node in enumerate(nodes_list): + (node_name, op_type), = node.items() + if isinstance(op_type, str): + op_type = [op_type] + multi_node_list.append([QueryGraphNode(op, graph_name) for op in op_type]) + node_index[node_name] = index + + multi_node = list(itertools.product(*multi_node_list)) + + for index, sub_nodes in enumerate(multi_node): + sub_graph_name = graph_name if index == 0 else f"{graph_name}#{index}" + sub_edge = [] + sub_node = dict() + for node in sub_nodes: + sub_node[node.op_name] = node + for edge in edges_list: + pre_node, next_node = edge + pre_node_index, next_node_index = node_index.get(pre_node), node_index.get(next_node) + sub_edge.append((sub_nodes[pre_node_index], sub_nodes[next_node_index])) + sub_graph = (sub_node, sub_edge, sub_graph_name,) + graphs.append(sub_graph) + return graphs diff --git a/profiler/advisor_review/common/profiling/__init__.py b/profiler/advisor_review/common/profiling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor_review/common/profiling/ge_info.py b/profiler/advisor_review/common/profiling/ge_info.py new file mode 100644 index 0000000000000000000000000000000000000000..9996ec611a2a835bd8dffd24c3fbe7d8817ec29a --- /dev/null +++ b/profiler/advisor_review/common/profiling/ge_info.py @@ -0,0 +1,47 @@ +""" +DB +""" +import logging +import os +from typing import Any, List + +from sqlalchemy import text + +from 
class TaskChecker:
    """
    Tracks (pid, tid) pairs that belong to SQE tasks so that subsequent
    tasks on the same pair can be filtered out.
    """

    def __init__(self):
        # (pid, tid) pairs observed on a task whose name ends with '_SQE'
        self.sqe_keys = set()

    def is_sqe(self, task: TaskInfo) -> bool:
        """Return True when `task` runs on a (pid, tid) already marked as SQE.

        The marker task itself (name ending in '_SQE') registers the pair
        but is reported as non-SQE.
        """
        pair = (task.pid, task.tid)
        task_name = task.args.get('name', '')
        if not task_name.endswith('_SQE'):
            return pair in self.sqe_keys
        self.sqe_keys.add(pair)
        return False
task.pid + + @property + def step_time(self): + return self._iteration_time + self._data_process_time + + @property + def iteration_time(self): + return self._iteration_time + + @property + def iter_max_time(self): + return self._max_time + + @property + def iter_min_time(self): + return self._min_time + + @property + def data_process_time(self): + return self._data_process_time + + @property + def tasks(self): + return self._tasks + + @property + def model_id(self): + return self._model_id + + @property + def iteration_id(self): + return self._iteration_id + + @property + def process_pid(self): + return self._process_pid + + def __len__(self): + return len(self._tasks) + + @property + def start_point(self): + return self._start_point diff --git a/profiler/advisor_review/common/profiling/op_summary.py b/profiler/advisor_review/common/profiling/op_summary.py new file mode 100644 index 0000000000000000000000000000000000000000..d79439dbad8e2c105bed737c1a1c3be1a2cecfc1 --- /dev/null +++ b/profiler/advisor_review/common/profiling/op_summary.py @@ -0,0 +1,76 @@ +""" +summary +""" +import logging +from decimal import Decimal +from typing import List, Any + +from profiler.advisor.dataset.profiling.info_collection import OpInfo +from profiler.advisor.dataset.profiling.profiling_parser import ProfilingParser +from profiler.advisor.utils.utils import format_excel_title, lazy_property + +logger = logging.getLogger() + + +class OpSummary(ProfilingParser): + """ + op summary + """ + + FILE_PATTERN = r"^op_summary_[_\d]+\.csv$" + FILE_PATTERN_MSG = "op_summary_*.csv" + FILE_INFO = "op summary" + STATIC_OP_STATE = "static" + DYNAMIC_OP_STATE = "dynamic" + + def __init__(self, path: str) -> None: + super().__init__(path) + self.op_list: List[OpInfo] = [] + self._total_task_duration = 0.0 + self._total_task_wait_time = 0.0 + self._raw_data: List[List[str]] = [] + + def parse_from_file(self, file: str): + if not self._parse_csv(file): + return False + title_dict = 
dict(enumerate(self._raw_data[0])) + for op_data in self._raw_data[1:]: + op_info = OpInfo() + for idx, value in enumerate(op_data): + title = title_dict.get(idx, "") + formatted_title = format_excel_title(title) + if formatted_title == 'task_start_time' and 'us' in title and \ + value.replace('.', '').replace("E+", "").isnumeric(): + value = str(Decimal(value) * Decimal(1000)) + op_info.add_attr(formatted_title, value) + self.op_list.append(op_info) + self._total_task_duration += self.get_float(op_info.get_attr("task_duration")) + self._total_task_wait_time += self.get_float(op_info.get_attr("task_wait_time")) + if not self.op_list: + logger.error("No valid op info in %s", file) + return False + return True + + def get_static_shape_operators(self) -> List[Any]: + return [op_info.get_attr("op_name") for op_info in self.op_list if op_info.get_attr("op_state") == self.STATIC_OP_STATE] + + def get_total_task_duration(self): + """ + get total task duration of all operators + :return: + """ + return self._total_task_duration + + @lazy_property + def task_dict(self): + """ + task dict + """ + task_dict = {} + for op_info in self.op_list: + if op_info.op_name not in task_dict: + task_dict[op_info.op_name] = [op_info] + else: + task_dict[op_info.op_name].append(op_info) + + return task_dict diff --git a/profiler/advisor_review/common/profiling/tasktime.py b/profiler/advisor_review/common/profiling/tasktime.py new file mode 100644 index 0000000000000000000000000000000000000000..3ce09a783851e94163aa72f423788a373da5eb3a --- /dev/null +++ b/profiler/advisor_review/common/profiling/tasktime.py @@ -0,0 +1,75 @@ +""" +task time +""" +import logging +from typing import Dict, List + +from profiler.advisor.dataset.profiling.info_collection import TaskInfo +from profiler.advisor.dataset.profiling.profiling_parser import ProfilingParser + +logger = logging.getLogger() + +AICPU_TASK_TYPE = "AI_CPU" +AICORE_TASK_TYPE = "AI_CORE" + + +class TaskTime(ProfilingParser): + """ + task time 
info + """ + + FILE_PATTERN = r"^task_time_[_\d]+\.json$" + FILE_PATTERN_MSG = "task_time*.json" + FILE_INFO = "task time" + + def __init__(self, path: str) -> None: + super().__init__(path) + self._tasks: List[TaskInfo] = [] + self._aicore_tasks: List[TaskInfo] = [] + self._aicpu_tasks: List[TaskInfo] = [] + self._process_map: Dict[str, str] = {} + self._pid_map: Dict[str, str] = {} + + def get_aicpu_tasks(self): + """ + get aicpu tasks + :return: aicpu tasks + """ + return self._aicpu_tasks + + def get_aicore_tasks(self): + """ + get aicore tasks + :return: aicore tasks + """ + return self._aicore_tasks + + def parse_from_file(self, file: str): + if not self._parse_json(file): + return False + for item in self._raw_data: + if item.get("ph") != "M": # header + continue + if item.get("name") != "process_name": + continue + pid = item.get("pid") + pname = item["args"]["name"] + self._process_map[pid] = pname + self._pid_map[pname] = pid + for item in self._raw_data: + if item.get("ph") == "M": # header + continue + task = TaskInfo(item) + self._tasks.append(task) + if task.pid != self._pid_map.get("Task Scheduler"): + continue + if task.task_type == AICORE_TASK_TYPE: + self._aicore_tasks.append(task) + elif task.task_type == AICPU_TASK_TYPE: + self._aicpu_tasks.append(task) + self._aicore_tasks.sort(key=lambda x: x.start_time) + self._aicpu_tasks.sort(key=lambda x: x.start_time) + if not self._tasks: + logger.error("No valid task info in %s", file) + return False + return True diff --git a/profiler/advisor_review/common/timeline/__init__.py b/profiler/advisor_review/common/timeline/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor_review/common/timeline/event.py b/profiler/advisor_review/common/timeline/event.py new file mode 100644 index 0000000000000000000000000000000000000000..6001ac88722e5a77daba1c960e8ccfd6894889e6 --- /dev/null +++ 
class AdvisorDict(dict):
    """Dictionary with attribute-style access; nested dicts are wrapped on the fly."""

    def __getstate__(self):
        return self.__dict__

    def __setstate__(self, state):
        self.__dict__.update(state)

    def __getattr__(self, key: str):
        # Missing keys resolve to an empty dict instead of raising AttributeError.
        if key not in self:
            return {}
        found = self[key]
        return AdvisorDict(found) if isinstance(found, dict) else found


class TimelineEvent(AdvisorDict):
    """A single trace event; `ts` and `dur` fields are convertible to float."""

    def ts_include(self, event):
        # True when `event`'s [ts, ts + dur] window lies inside our own window.
        own_start = float(self.ts)
        own_end = own_start + float(self.dur)
        other_start = float(event.ts)
        other_end = other_start + float(event.dur)
        return own_start <= other_start and own_end >= other_end
logger.debug("Successfully find The %s file which is specified by the environment variable: %s.", + specified_file_path, constant.ADVISOR_RULE_PATH) + return specified_file_path + logger.warning("The %s does not exist in path: %s. Try to use cloud or default local YAML file.", + constant.TIMELINE_FUSION_OPS_YAML_NAME, os.path.normpath(advisor_rule_path)) + # 检查云文件默认保存路径文件夹下是否存在相应文件, 默认路径 ~/rules/cloud/ + cloud_file_path = os.path.join(os.path.expanduser("~"), constant.CLOUD_RULE_PATH, constant.TIMELINE_FUSION_OPS_YAML_NAME) + if os.path.exists(cloud_file_path): + logger.debug("Successfully find The cloud %s file in %s.", constant.TIMELINE_FUSION_OPS_YAML_NAME, + cloud_file_path) + return cloud_file_path + # 检查本地默认文件 + local_file_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), + constant.DEFAULT_RULE_PATH, constant.TIMELINE_FUSION_OPS_YAML_NAME) + if not os.path.exists(local_file_path): + # 若本地默认文件不存在, 则log异常信息并 + logger.error("The default local YAML file does not exist. 
Please check the YAML file in the default path %s.", + local_file_path) + return local_file_path + + +class FusionOperatorDB: + + def __init__(self, file_path=None, cann_version=None, torch_version=None): + self.timeline_fusion_ops_yaml_path = os.path.normpath(get_timeline_fusion_ops_yaml_path()) + + self.cann_version = cann_version or constant.DEFAULT_CANN_VERSION + self.torch_version = torch_version or constant.DEFAULT_TORCH_VERSION + + self._supported_version_dict = {} + + self.is_empty = False + self.timeline_op_rule_handler = TimelineOpRuleHandler() + self.fusion_operator = self._load_yaml(self.timeline_fusion_ops_yaml_path) + + self._dequeue_op_names = [] + self._aten_op_names = [] + self._optimizer_op_names = [] + self._dequeue_op_api_map = {} + self._aten_op_api_map = {} + self._optimizer_op_api_map = {} + self._parse_db() + + @property + def dequeue_op_names(self): + return self._dequeue_op_names + + @property + def aten_op_names(self): + return self._aten_op_names + + @property + def optimizer_op_names(self): + return self._optimizer_op_names + + @property + def dequeue_op_api_map(self): + return self._dequeue_op_api_map + + @property + def aten_op_api_map(self): + return self._aten_op_api_map + + @property + def optimizer_op_api_map(self): + return self._optimizer_op_api_map + + def get_fusion_operator_with_unique_id(self, unique_id): + if unique_id == constant.TIMELINE_FUSION_OPS_INVALID_UNIQUE_ID: + logger.warning("The specified unique id: %s is invalid.Please check whether the rule of the unique id " + "exists and modify the rule.", constant.TIMELINE_FUSION_OPS_INVALID_UNIQUE_ID) + return {} + result_tmp_rule = self.timeline_op_rule_handler.get_tmp_timeline_op_rule_with_unique_id(unique_id) + result_op_rule = OpRule(result_tmp_rule) + return result_op_rule.get_final_rules() + + def regenerate_timeline_op_rule_with_unique_id(self, unique_id): + self.fusion_operator.clear() + logger.debug("Program try to regenerate the rule to version %s.", unique_id) + 
self.fusion_operator = self.get_fusion_operator_with_unique_id(unique_id) + self.regenerate_op_api_map_and_op_names() + + def regenerate_timeline_op_rule_with_version(self, cann_version=None, torch_version=None): + cann_version = cann_version or self.cann_version + torch_version = torch_version or self.torch_version + unique_id = self._get_unique_id_in_supported_version_dict(cann_version=cann_version, + torch_version=torch_version) + self.regenerate_timeline_op_rule_with_unique_id(unique_id) + + def regenerate_op_api_map_and_op_names(self): + self._dequeue_op_names.clear() + self._aten_op_names.clear() + self._optimizer_op_names.clear() + self._dequeue_op_api_map.clear() + self._aten_op_api_map.clear() + self._optimizer_op_api_map.clear() + self._parse_db() + + def _is_version_supported(self, db_content): + """校验当前版本是否被规则库中的版本支持, 保存版本支持信息数组, 按数组或字符串的可变方式保存""" + if db_content is None: + logger.warning( + "The rule library is empty. Check the rule library file: %s", + self.timeline_fusion_ops_yaml_path + ) + return False + for rule_dic in db_content: + if not isinstance(rule_dic, dict) or rule_dic.get("unique_id") is None: + continue + cann_version_list = rule_dic.get("cann_version") + torch_version_list = rule_dic.get("torch_version") + if not cann_version_list or not torch_version_list: + continue + supported_version = [cann_version_list, torch_version_list] + + unique_id = rule_dic.get("unique_id") + if unique_id < 0: + logger.warning( + "The unique id: %s of the rule should be a positive integer. " + "Please check and modify the rule configuration in the YAML file: %s.", + unique_id, os.path.normpath(self.timeline_fusion_ops_yaml_path) + ) + self._supported_version_dict[unique_id] = supported_version + + # 若解析timeline规则库的版本支持数组为空, 则存在问题 + if not self._supported_version_dict: + logger.warning( + "The rule library does not contain rules that support the current version. 
" + "Check the rule library file: %s", + self.timeline_fusion_ops_yaml_path + ) + return False + + # 检验当前版本是否被规则库支持 + is_version_supported = self._is_version_supported_in_supported_version_dict() + if not is_version_supported: + # 若规则库不支持当前版本, 则log警告信息 + logger.warning("Unsupported versions: cann-%s and torch-%s, supported version list of ['cann', 'torch'] " + "is %s", self.cann_version, self.torch_version, self._supported_version_dict.values()) + return is_version_supported + + def _is_version_supported_in_supported_version_dict(self, cann_version=None, torch_version=None): + """校验当前版本是否存在在规则库中的版本支持字典中""" + for _, supported_version in self._supported_version_dict.items(): + if self._is_version_supported_in_versions(supported_version, cann_version, torch_version): + return True + return False + + def _get_unique_id_in_supported_version_dict(self, cann_version=None, torch_version=None) -> int: + """校验当前版本是否存在在规则库中的版本支持字典中, 在使用前请检查是否支持该版本""" + for key_unique_id, supported_version in self._supported_version_dict.items(): + if self._is_version_supported_in_versions(supported_version, cann_version, torch_version): + return key_unique_id + return constant.TIMELINE_FUSION_OPS_INVALID_UNIQUE_ID + + def _is_version_supported_in_versions(self, supported_version, cann_version=None, torch_version=None): + """校验当前cann版本和torch版本是否存在在规则库中的版本支持数组的元素中""" + cann_version_list = supported_version[0] + if not isinstance(cann_version_list, list): + cann_version_list = [cann_version_list] + + torch_version_list = supported_version[1] + if not isinstance(torch_version_list, list): + torch_version_list = [torch_version_list] + + cann_version = cann_version or self.cann_version + torch_version = torch_version or self.torch_version + + if (cann_version in cann_version_list) and (torch_version in torch_version_list): + return True + return False + + def _parse_db(self): + """生成输出的规则库""" + self._parse(constant.ATEN) + self._parse(constant.DEQUEUE) + self._parse(constant.OPTIMIZER) + + def 
_parse(self, mode): + """生成输出的规则库中指定部分, 如aten, Optimizer等""" + op_info = self.fusion_operator.get(mode, []) or [] + for ops in op_info: + for npu_api, op_combined in ops.items(): + if not isinstance(op_combined, list): + self._parse_in_list(mode, op_combined, npu_api) + for _op_combined in op_combined: + self._parse_in_list(mode, _op_combined, npu_api) + + def _parse_in_list(self, mode, op_combined, npu_api): + """生成输出的规则库中具体部分, 如{silu: torch_npu.npu_silu/torch_npu.contrib.module.SiLU}等""" + if not isinstance(op_combined, str): + logger.warning("Error type in yaml: %s", op_combined) + return + mode_str = mode.lower() + getattr(self, f"{mode_str}_op_names", []).extend(op_combined.split("-")) + + new_npu_api = npu_api + pre_npu_api = getattr(self, f"{mode_str}_op_api_map", {}).get(op_combined) + if pre_npu_api: + new_npu_api = f"{pre_npu_api}/{npu_api}" + getattr(self, f"{mode_str}_op_api_map", {})[op_combined] = new_npu_api + logger.debug("Output rule: %s: %s: %s: %s ", mode, op_combined, new_npu_api, op_combined.split("-")) + + def _load_yaml(self, file_path): + """生成timeline规则库""" + logger.debug("Try to use the following yaml file as timeline ops rule: %s.", os.path.abspath(file_path)) + # 若文件不存在,则报错, 并返回空字典 + if not os.path.exists(file_path): + logger.warning("Path: '%s' does not exist, please specific existed path of " + "fusion operators yaml file by setting env '%s'", + os.path.abspath(file_path), constant.ADVISOR_RULE_PATH) + self.is_empty = True + return {} + + logger.debug("The rule yaml file is successfully found in path: %s", os.path.abspath(file_path)) + + with open(file_path, "rb") as file: + db_content = yaml.safe_load(file) + + if not self._is_version_supported(db_content): + self.is_empty = True + return {} + + logger.debug("The rule library supports the current environment version.") + + # 获取所有版本timeline规则库 + self.timeline_op_rule_handler.set_db_content(db_content) + + # 获取所需版本规则 + unique_id = self._get_unique_id_in_supported_version_dict() + 
logger.debug("Program is using version %s of the rule.", unique_id) + result_op_rule = self.get_fusion_operator_with_unique_id(unique_id) + if result_op_rule and len(result_op_rule) > 0: + return result_op_rule + + logger.warning( + "Failed to load fusion operators database, skip analyze timeline for affinity api," + " please refer to database yaml %s to customize your yaml.", + self.timeline_fusion_ops_yaml_path + ) + self.is_empty = True + return {} diff --git a/profiler/advisor_review/common/timeline/fusion_ops_rule.py b/profiler/advisor_review/common/timeline/fusion_ops_rule.py new file mode 100644 index 0000000000000000000000000000000000000000..deee68edb9a92d0588f3f3c155a7b2595317a5c7 --- /dev/null +++ b/profiler/advisor_review/common/timeline/fusion_ops_rule.py @@ -0,0 +1,110 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. +import copy +import logging + +from profiler.advisor.utils.log import get_log_level + +logger = logging.getLogger() +logger.setLevel(get_log_level()) + + +class OpRule: + + def __init__(self, rule=None, timeline_op_rule_handler=None): + if rule is None: + self._tmp_rule = {} + else: + self._tmp_rule = copy.deepcopy(rule) + if timeline_op_rule_handler is None: + self.timeline_op_rule_handler = {} + else: + self.timeline_op_rule_handler = copy.deepcopy(timeline_op_rule_handler) + self._rule = {} + + @property + def tmp_rule(self): + return self._tmp_rule + + @staticmethod + def _format_rule(rule): + """格式化规则函数, 将额外规则格式化为{key,数组list}形式, 使得yaml文件中operator_rules若写成key:str形式也能正常读取""" + format_rule = {} + for key, val in rule.items(): + if not isinstance(val, list): + val = [val] + format_rule[key] = val + return format_rule + + def merge(self, extra_rule): + """合并函数, 将已有规则库与额外规则合并, 若无继承则已有规则库应为空""" + for key, val in extra_rule.items(): + for func, op_rules in val.items(): + try: + getattr(self, f"{func}")(key, op_rules) + except AttributeError: + logger.error("Undefined field and function name. 
Ensure that %s is correct in the rule " + "library.", func) + + def get_final_rules(self): + """获取最终的规则库""" + self._restore_rule() + return self._rule + + def add(self, key, add_rules: dict): + """新增函数, 新增已有规则库不存在的额外规则""" + if add_rules is None: + return + if self._tmp_rule.get(key) is None: + self._tmp_rule[key] = {} + format_add_rule = self._format_rule(add_rules) + for add_key, add_val in format_add_rule.items(): + logger.debug("add: %s: %s", add_key, add_val) + if add_key not in self._tmp_rule: + self._tmp_rule[key][add_key] = add_val + else: + logger.warning("This key has been written to the rule, " + "%s: %s should be written in the overwrite section", add_key, add_val) + self._tmp_rule[key][add_key].update(add_val) + + def overwrite(self, key, overwrite_rules: dict): + """重写函数, 重写已有规则库中已经存在的规则""" + if overwrite_rules is None: + return + if self._tmp_rule.get(key) is None: + self._tmp_rule[key] = {} + format_overwrite_rules = self._format_rule(overwrite_rules) + for overwrite_key, overwrite_val in format_overwrite_rules.items(): + logger.debug("overwrite: %s: %s", overwrite_key, overwrite_val) + if overwrite_key not in self._tmp_rule: + logger.warning("This key is not written to the rule. " + "%s: %s should be written in the add section", overwrite_key, overwrite_val) + self._tmp_rule[key][overwrite_key] = overwrite_val + else: + self._tmp_rule[key][overwrite_key].update(overwrite_val) + + def exclude(self, key, exclude_rules: list): + """除外函数, 将已有规则库已有的规则除外删除""" + if exclude_rules is None: + return + for exclude_key in exclude_rules: + logger.debug("exclude: %s", exclude_key) + if isinstance(exclude_key, str): + if exclude_key not in self._tmp_rule[key]: + logger.warning("This key is not written to the rule. 
" + "do not need to exclude: %s.", exclude_key) + continue + self._tmp_rule[key].pop(exclude_key) + else: + logger.warning("Error type rule in exclude: %s", exclude_key) + + def inherit_unique_id(self, key, inherit_unique_id): + """局部继承函数, 将规则库中指定unique_id版本覆盖指定位置""" + result_rule = self.timeline_op_rule_handler.get_tmp_timeline_op_rule_with_unique_id(inherit_unique_id) + if result_rule is not None and result_rule.get(key) is not None: + self._tmp_rule[key] = copy.deepcopy(result_rule.get(key)) + return + logger.error("Rule library version %s does not exist. ", inherit_unique_id) + + def _restore_rule(self): + for key, op_api_map in self._tmp_rule.items(): + self._rule[key] = [{op_combined: api} for op_combined, api in op_api_map.items()] diff --git a/profiler/advisor_review/common/timeline/fusion_ops_rule_handler.py b/profiler/advisor_review/common/timeline/fusion_ops_rule_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..b0558cca6d951ee057e538b5e4da6d9c2e78111b --- /dev/null +++ b/profiler/advisor_review/common/timeline/fusion_ops_rule_handler.py @@ -0,0 +1,193 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. 
+import copy +import logging + +from profiler.advisor.common import constant +from profiler.advisor.common.timeline.fusion_ops_rule import OpRule +from profiler.advisor.utils.log import get_log_level + +logger = logging.getLogger() +logger.setLevel(get_log_level()) + + +class TimelineOpRuleHandler: + """基于线性规划思想保存OpRule,用于局部继承、全局继承等功能""" + + def __init__(self): + self._db_content = None + # 具体生成的timeline规则,key为unique_id + self._all_tmp_timeline_op_rule = {} + # 所有timeline规则的dict集合,key为unique_id + self._all_origin_timeline_op_rule_dict = {} + # 已生成timeline规则的id数组 + self._exist_timeline_op_rule_unique_id_list = [] + + @staticmethod + def _get_local_inherit_id_list(op_rule: dict): + local_inherit_id_list = [] + for _, val in op_rule.items(): + if val.get("inherit_unique_id") is not None: + local_inherit_id_list.append(val.get("inherit_unique_id")) + return local_inherit_id_list + + @staticmethod + def _is_duplicated_element_in_lists(list_a, list_b): + """检查两个数组中是否存在重复的元素,若有任意元素重复,返回True""" + if not isinstance(list_a, list): + list_a = [list_a] + if not isinstance(list_b, list): + list_b = [list_b] + # 将两个数组合并为一个列表,使用集合(set)判断列表中是否存在重复元素 + combined_list = list_a + list_b + if len(combined_list) != len(set(combined_list)): + return True + return False + + def set_db_content(self, db_content): + # 过滤非 dict 格式, 或 dict 中没有定义 unique_id 的数据, 并保存到 _all_origin_timeline_op_rule_dict 中 + self._db_content = copy.deepcopy(db_content) + for rule_dic in self._db_content: + if not isinstance(rule_dic, dict) or rule_dic.get("unique_id") is None: + continue + self._all_origin_timeline_op_rule_dict[rule_dic.get("unique_id")] = rule_dic + if self._all_origin_timeline_op_rule_dict: + self.generate_all_timeline_op_rule() + + def generate_basic_timeline_op_rules(self): + """用于实现获取无全局继承规则, 无全局继承的规则认为是基础版本规则, 默认不会存在局部继承""" + for _, rule_dic in self._all_origin_timeline_op_rule_dict.items(): + if rule_dic.get("inherit_unique_id") is None: + self.add_basic_timeline_op_rule(rule_dic) + + def 
add_basic_timeline_op_rule(self, rule_dic): + # 若基础规则中存在局部继承的规则,则跳过 + local_inherit_id_list = self._get_local_inherit_id_list(rule_dic.get("operator_rules")) + if local_inherit_id_list: + return + + temp_rule = OpRule() + temp_rule.merge(rule_dic.get("operator_rules")) + + unique_id = rule_dic.get("unique_id") + logger.debug("The rule of version %s is basic rule.", unique_id) + self.add_new_timeline_op_rule(unique_id, temp_rule.tmp_rule) + + def add_empty_timeline_op_rule(self, unique_id): + if self._all_origin_timeline_op_rule_dict.get(unique_id) is None: + self._all_origin_timeline_op_rule_dict[unique_id] = {} + tmp_rule = {} + logger.debug("The rule of version %s is empty.", unique_id) + self.add_new_timeline_op_rule(unique_id, tmp_rule) + + def add_new_timeline_op_rule(self, unique_id, tmp_rule): + if unique_id not in self._exist_timeline_op_rule_unique_id_list: + self._exist_timeline_op_rule_unique_id_list.append(unique_id) + self._all_tmp_timeline_op_rule[unique_id] = tmp_rule + logger.debug("The rule of version %s is successfully generated.", unique_id) + + def generate_specified_list_timeline_op_rule(self, specified_unique_id_list, kid_id_list=None): + for specified_unique_id in specified_unique_id_list: + if specified_unique_id in self._exist_timeline_op_rule_unique_id_list: + self.generate_specified_timeline_op_rule(specified_unique_id, kid_id_list) + + def generate_specified_timeline_op_rule(self, specified_unique_id, kid_id_list=None): + """用于实现生成特定版本规则 + + 若不存在相应specified_unique_id的规则、或是已生成、循环继承等情况,将该规则置空并返回 + 规则库文件结构设置为多叉树, 结构决定了不断向下搜索最终应该是从基础版本开始继承, 递归生成, + 直到specified_unique_id规则依赖继承的规则库全部生成完毕, 再生成该指定规则库, 将specified_unique_id的规则库归档 + + 参数: + specified_unique_id: 指定版本规则id + kid_id_list: 子规则id数组, 用于防止循环继承, 如间接继承自身或直接继承自身等情况 + 返回: + None + """ + if kid_id_list is None: + kid_id_list = [] + + # 若该unique_id规则在timeline_fusion_ops.yaml中没有相应的规则, 生成该id规则,置为空 + if self._all_origin_timeline_op_rule_dict.get(specified_unique_id) is None: + logger.warning("The 
specified version %s does not exist in the rule library. " + "Ensure that the corresponding rule is configured in the YAML file. " + "The version %s is left blank.", + specified_unique_id, + specified_unique_id) + self.add_empty_timeline_op_rule(specified_unique_id) + return + + # 若该unique_id规则已经生成,则无需再次生成 + if specified_unique_id in self._exist_timeline_op_rule_unique_id_list: + logger.warning("The rule has been generated and does not need to be generated again. " + "Check whether unique id %s in the YAML file is duplicate.", + specified_unique_id) + return + + # 若kid_id_list不为空,且间接继承自身,则尝试生成空规则用于继承 + if kid_id_list and self._is_duplicated_element_in_lists(specified_unique_id, kid_id_list): + logger.warning("It cannot be inherited indirectly. Ensure that the corresponding rules are correctly " + "configured in the YAML file and leave Version %s blank.", + specified_unique_id) + self.add_empty_timeline_op_rule(specified_unique_id) + return + + rule_dic = self._all_origin_timeline_op_rule_dict.get(specified_unique_id) + if rule_dic is not None: + kid_id_list.append(specified_unique_id) + + global_inherit_id = rule_dic.get("inherit_unique_id") + if global_inherit_id and global_inherit_id not in self._exist_timeline_op_rule_unique_id_list: + logger.debug("The rule of version %s global inherit the rule of version %s", + specified_unique_id, global_inherit_id) + self.generate_specified_timeline_op_rule(global_inherit_id, kid_id_list) + + # 若局部继承的规则未生成, 生成该规则 + local_inherit_id_list = self._get_local_inherit_id_list(rule_dic.get("operator_rules")) + if local_inherit_id_list: + logger.debug("The rule of version %s local inherit the rule of version %s", + specified_unique_id, local_inherit_id_list) + self.generate_specified_list_timeline_op_rule(specified_unique_id_list=local_inherit_id_list, + kid_id_list=kid_id_list) + logger.debug("Start to generate rule of version %s", specified_unique_id) + # 实现全局继承与局部继承 + temp_rule = OpRule(timeline_op_rule_handler=self, + 
rule=self._all_tmp_timeline_op_rule.get(global_inherit_id)) + temp_rule.merge(rule_dic.get("operator_rules")) + # 将生成的规则归档保存 + self.add_new_timeline_op_rule(specified_unique_id, temp_rule.tmp_rule) + return + logger.error("Failed to generate the rule whose unique_id is %s. Ensure that the rule is configured in " + "the YAML file and the version %s is empty.", specified_unique_id, specified_unique_id) + self.add_empty_timeline_op_rule(specified_unique_id) + + def generate_all_timeline_op_rule(self): + """用于实现获取所有版本规则 + + 查找db_content中的规则库, 规则库文件结构设置为多叉树, 优先生成无继承的基础规则版本 + 循环并生成其他版本, 文件结构决定了不断向下搜索最终应该是从基础版本开始继承, 递归生成,直到全部规则库生成后退出函数 + + 参数: + None + 返回: + None + """ + self.generate_basic_timeline_op_rules() + _unique_id_list = copy.deepcopy(list(self._all_origin_timeline_op_rule_dict.keys())) + for unique_id in _unique_id_list: + if unique_id in self._exist_timeline_op_rule_unique_id_list: + continue + self.generate_specified_timeline_op_rule(unique_id) + + def get_tmp_timeline_op_rule_with_unique_id(self, unique_id): + if unique_id not in self._exist_timeline_op_rule_unique_id_list: + logger.error("The specified unique_id does not exist in the rule library. Ensure that the " + "corresponding rule is configured in the YAML file and the version %s is empty." + "If the value of unique_id is a negative number, the version may not be supported.", + unique_id) + self.add_empty_timeline_op_rule(unique_id) + if unique_id < 0: + logger.error("Advise to use a positive integer as the unique id of rules. " + "Negative numbers: %s are not recommended to use as unique id. 
" + "If specified invalid unique id: %s is used, an empty rule is returned by default.", + unique_id, constant.TIMELINE_FUSION_OPS_INVALID_UNIQUE_ID) + return self._all_tmp_timeline_op_rule.get(unique_id) diff --git a/profiler/advisor_review/common/version_control.py b/profiler/advisor_review/common/version_control.py new file mode 100644 index 0000000000000000000000000000000000000000..38b054543fc61e90d91e8442a547376cff4c6406 --- /dev/null +++ b/profiler/advisor_review/common/version_control.py @@ -0,0 +1,26 @@ +import logging +from typing import List + +logger = logging.getLogger() + + +class VersionControl: + _SUPPORT_VERSIONS = [] + + @classmethod + def is_supported(cls, cann_version: str) -> bool: + """ + Check whether the CANN software version is supported, which can be viewed by executing the following command: + 'cat /usr/local/Ascend/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info' + """ + flag = (cls._SUPPORT_VERSIONS.__contains__(cann_version)) + if not flag: + logger.debug("class type is %s, which is not support current CANN version %s", cls.__name__, cann_version) + return flag + + def get_support_version(self) -> List[str]: + """ + Acquire the CANN software version + :return: supported CANN software version + """ + return self._SUPPORT_VERSIONS diff --git a/profiler/advisor_review/computation_analysis.ipynb b/profiler/advisor_review/computation_analysis.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..15d8618bd9f32dccbee214c0a79f2be6863314cb --- /dev/null +++ b/profiler/advisor_review/computation_analysis.ipynb @@ -0,0 +1,748 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "sys.path.append(\"../..\")\n", + "\n", + "from prettytable import PrettyTable, ALL\n", + "from textwrap import fill\n", + "from profiler.advisor.interface.interface import Interface" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + 
"metadata": {}, + "outputs": [], + "source": [ + "# 配置profiling采集出来的数据,需要指定到的profiling目录是同一个工具采集的,并且需要采集l0级别以上\n", + "profiling_path = r\"YOUR PROFILING PATH\"\n", + "interface = Interface(profiling_path=profiling_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Block Dim问题识别\n", + "\n", + "Block Dim问题主要为识别相关core算子AI core核未打满或者Vector 核未打满问题,主要调优手段为AOE调优,由于AOE调优依赖静态shape,所以当所有算子都为动态shape时,将不会检测相关Block Dim问题.\n", + "\n", + "下列代码为样例,主要展示如何检测Block Dim类型问题,并获取相关问题检测结果:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# 查询computation相关是否存在block dim问题\n", + "# 如果profiling数据采集自非8.0.0的CANN版本,需要在训练/推理环境中执行: 'cat CANN安装目录/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info'命令查看version\n", + "block_dim_result = interface.get_result(\"computation\", \"block_dim_analysis\", cann_version=\"7.0.RC1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
problemdescriptionsuggestionproblem counttotal_time(us)time ratioincome(us)income ratio
block dimsome operator does not make full use of 25 ai core or 50 ai vector core; Top-10
operator of task duration are as follows: Square, MatMulV2, BatchMatMul,
SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2
--model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi
sor\\operator_tuning_file_20240613153259.cfg'
101814.01999999999991.0
" + ], + "text/plain": [ + "+-----------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| problem | description | suggestion | problem count | total_time(us) | time ratio | income(us) | income ratio |\n", + "+-----------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| block dim | some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 | 1. Optimize operator by AOE, such as: 'aoe --job_type=2 | 101 | 814.0199999999999 | 1.0 | | |\n", + "| | operator of task duration are as follows: Square, MatMulV2, BatchMatMul, | --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi | | | | | |\n", + "| | SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | sor\\operator_tuning_file_20240613153259.cfg' | | | | | |\n", + "+-----------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "problems = block_dim_result.get(\"problems\")\n", + "if problems: # 如果存在相关问题则获取相关问题检测描述及建议\n", + " problem_table = PrettyTable(problems.get(\"headers\"))\n", + " for row in problems.get(\"data\"):\n", + " row = [fill(str(element), width=80) for element in row]\n", + " problem_table.add_row(row)\n", + " \n", + " problem_table.align = \"l\"\n", + " problem_table.hrules = ALL\n", + " display(problem_table)\n", + "else:\n", + " print(\"There is no suggestion related to 
block dim.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
op_nameop_typetask_typetask_durationincomeblock_dimmix_block_diminput_shapesinput_data_typesinput_formatsoutput_shapesoutput_data_typesoutput_formats
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm-
LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35
SquareAI_VECTOR_CORE42.760160"128,128"FLOATNCHW"128,1"FLOATNCHW
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm-
LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78
SquareAI_VECTOR_CORE42.240160"128,128"FLOATNCHW"128,1"FLOATNCHW
Default/lm_head-Linear/MatMul-op213MatMulV2AI_CORE39.020200"128,128;128,32000"FLOAT16;FLOAT16FORMAT_ND;FORMAT_ND"128,32000"FLOATFORMAT_ND
" + ], + "text/plain": [ + "+-----------------------------------------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| op_name | op_type | task_type | task_duration | income | block_dim | mix_block_dim | input_shapes | input_data_types | input_formats | output_shapes | output_data_types | output_formats |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm- | Square | AI_VECTOR_CORE | 42.76 | 0 | 16 | 0 | \"128,128\" | FLOAT | NCHW | \"128,1\" | FLOAT | NCHW |\n", + "| LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers- | | | | | | | | | | | | |\n", + "| CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35 | | | | | | | | | | | | |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm- | Square | AI_VECTOR_CORE | 42.24 | 0 | 16 | 0 | \"128,128\" | FLOAT | NCHW | \"128,1\" | FLOAT | NCHW |\n", + "| LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers- | | | | | | | | | | | | |\n", + "| CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78 | | | | | | | | | | | | |\n", + 
"+-----------------------------------------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/lm_head-Linear/MatMul-op213 | MatMulV2 | AI_CORE | 39.02 | 0 | 20 | 0 | \"128,128;128,32000\" | FLOAT16;FLOAT16 | FORMAT_ND;FORMAT_ND | \"128,32000\" | FLOAT | FORMAT_ND |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "if problems: # 如果存在相关问题则获取相关问题检测细节\n", + " block_dim = block_dim_result.get(\"block dim\")\n", + " block_dim_table = PrettyTable(block_dim.get(\"headers\"))\n", + " for row in block_dim.get(\"data\"):\n", + " row = [fill(str(element), width=80) for element in row]\n", + " block_dim_table.add_row(row)\n", + "\n", + " block_dim_table.hrules = ALL\n", + " display(block_dim_table[:3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Operator No Bound问题识别\n", + "Operator No Bound问题主要为识别相关算子无mte, cube, vector, scalar相关bound问题,主要调优手段为AOE调优,由于AOE调优依赖静态shape,所以当所有算子都为动态shape时,将不会检测相关Operator No Bound问题.\n", + "\n", + "下列代码为样例,主要展示如何检测Operator No Bound类型问题,并获取相关问题检测结果:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from prettytable import PrettyTable, ALL\n", + "from textwrap import fill\n", + "from profiler.advisor.interface.interface import Interface\n", + "\n", + "\n", + "# 配置profiling采集出来的数据,需要指定到的profiling目录是同一个工具采集的,并且需要采集l0级别以上\n", + "profiling_path = r\"YOUR PROFILING PATH\"\n", + "interface = Interface(profiling_path=profiling_path)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# 查询computation相关是否存在operator no bound问题\n", + "# 如果profiling数据采集自非8.0.0的CANN版本,需要在训练/推理环境中执行: 'cat CANN安装目录/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info'命令查看version\n", + "operator_no_bound_result = interface.get_result(\"computation\", \"operator_no_bound_analysis\", cann_version=\"7.0.RC1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
problemdescriptionsuggestionproblem counttotal_time(us)time ratioincome(us)income ratio
block dimsome operator does not make full use of 25 ai core or 50 ai vector core; Top-10
operator of task duration are as follows: Square, MatMulV2, BatchMatMul,
SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2
--model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi
sor\\operator_tuning_file_20240613153259.cfg'
101814.01999999999991.0
operator no boundThere is no mte, cube, vector, scalar ratio is more than 80.00%; Top task
duration operators need to be tuned are as follows: Square, MatMulV2,
BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2
--model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi
sor\\operator_tuning_file_20240613153259.cfg'
95814.01999999999990.7985
" + ], + "text/plain": [ + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| problem | description | suggestion | problem count | total_time(us) | time ratio | income(us) | income ratio |\n", + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| block dim | some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 | 1. Optimize operator by AOE, such as: 'aoe --job_type=2 | 101 | 814.0199999999999 | 1.0 | | |\n", + "| | operator of task duration are as follows: Square, MatMulV2, BatchMatMul, | --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi | | | | | |\n", + "| | SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | sor\\operator_tuning_file_20240613153259.cfg' | | | | | |\n", + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| operator no bound | There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task | 1. 
Optimize operator by AOE, such as: 'aoe --job_type=2 | 95 | 814.0199999999999 | 0.7985 | | |\n", + "| | duration operators need to be tuned are as follows: Square, MatMulV2, | --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi | | | | | |\n", + "| | BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | sor\\operator_tuning_file_20240613153259.cfg' | | | | | |\n", + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "problems = operator_no_bound_result.get(\"problems\")\n", + "problem_table = PrettyTable(problems.get(\"headers\"))\n", + "if problems: # 如果存在相关问题则获取相关问题检测描述及建议\n", + " for row in problems.get(\"data\"):\n", + " row = [fill(str(element), width=80) for element in row]\n", + " problem_table.add_row(row)\n", + "\n", + " problem_table.align = \"l\"\n", + " problem_table.hrules = ALL\n", + " display(problem_table)\n", + "else:\n", + " print(\"There is no suggestion related to operator no bound.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
op_nameop_typetask_typetask_durationvec_ratiomac_ratioscalar_ratiomte1_ratiomte2_ratiomte3_ratioblock_diminput_shapesinput_data_typesinput_formatsoutput_shapesoutput_data_typesoutput_formats
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm-
LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35
SquareAI_VECTOR_CORE42.760.465400000.005616"128,128"FLOATNCHW"128,1"FLOATNCHW
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm-
LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78
SquareAI_VECTOR_CORE42.240.46600000.006216"128,128"FLOATNCHW"128,1"FLOATNCHW
Default/lm_head-Linear/MatMul-op213MatMulV2AI_CORE39.0200.11050.01190.08570.4284020"128,128;128,32000"FLOAT16;FLOAT16FORMAT_ND;FORMAT_ND"128,32000"FLOATFORMAT_ND
" + ], + "text/plain": [ + "+-----------------------------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| op_name | op_type | task_type | task_duration | vec_ratio | mac_ratio | scalar_ratio | mte1_ratio | mte2_ratio | mte3_ratio | block_dim | input_shapes | input_data_types | input_formats | output_shapes | output_data_types | output_formats |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm- | Square | AI_VECTOR_CORE | 42.76 | 0.4654 | 0 | 0 | 0 | 0 | 0.0056 | 16 | \"128,128\" | FLOAT | NCHW | \"128,1\" | FLOAT | NCHW |\n", + "| LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers- | | | | | | | | | | | | | | | | |\n", + "| CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35 | | | | | | | | | | | | | | | | |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm- | Square | AI_VECTOR_CORE | 42.24 | 0.466 | 0 | 0 | 0 | 0 | 0.0062 | 16 | \"128,128\" | FLOAT | NCHW | \"128,1\" | FLOAT | NCHW |\n", + "| LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers- | | | | | | 
| | | | | | | | | | |\n", + "| CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78 | | | | | | | | | | | | | | | | |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/lm_head-Linear/MatMul-op213 | MatMulV2 | AI_CORE | 39.02 | 0 | 0.1105 | 0.0119 | 0.0857 | 0.4284 | 0 | 20 | \"128,128;128,32000\" | FLOAT16;FLOAT16 | FORMAT_ND;FORMAT_ND | \"128,32000\" | FLOAT | FORMAT_ND |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "if problems: # 如果存在相关问题则获取相关问题检测细节\n", + " operator_no_bound = operator_no_bound_result.get(\"operator no bound\")\n", + " operator_no_bound_table = PrettyTable(operator_no_bound.get(\"headers\"))\n", + " for row in operator_no_bound.get(\"data\"):\n", + " row = [fill(str(element), width=80) for element in row]\n", + " operator_no_bound_table.add_row(row)\n", + " operator_no_bound_table.hrules = ALL\n", + " display(operator_no_bound_table[:3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### AICPU问题识别\n", + "AICPU问题主要为识别相关算子执行时跑到AICPU上计算,并没有利用到AI CORE的计算能力的场景,主要调优手段为修改相关代码来避免AICPU算子,可参见相关资料,来避免AICPU算子的问题:\n", + "https://support.huaweicloud.com/bestpractice-modelarts/modelarts_10_2517.html\n", + "\n", + "下列代码为样例,主要展示如何检测Dynamic Shape类型问题,并获取相关问题检测结果:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": 
[], + "source": [ + "from prettytable import PrettyTable, ALL\n", + "from textwrap import fill\n", + "from profiler.advisor.interface.interface import Interface\n", + "\n", + "\n", + "# 配置profiling采集出来的数据,需要指定到的profiling目录是同一个工具采集的,并且需要采集l0级别以上\n", + "profiling_path = r\"YOUR PROFILING PATH\"\n", + "interface = Interface(profiling_path=profiling_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Please ensure only one trace_view.json in C:\\personalC\\profiling_data, there will analyze first timeline profiling data.\n", + " \r" + ] + } + ], + "source": [ + "# 查询computation相关是否存在aicpu问题\n", + "# 如果profiling数据采集自非8.0.0的CANN版本,需要在训练/推理环境中执行: 'cat CANN安装目录/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info'命令查看version\n", + "aicpu_result = interface.get_result(\"computation\", \"aicpu_analysis\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
problemdescriptionsuggestionproblem counttotal_time(us)time ratioincome(us)income ratio
block dimsome operator does not make full use of 25 ai core or 50 ai vector core; Top-10
operator of task duration are as follows: Square, MatMulV2, BatchMatMul,
SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2
--model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi
sor\\operator_tuning_file_20240613153259.cfg'
101814.01999999999991.0
operator no boundThere is no mte, cube, vector, scalar ratio is more than 80.00%; Top task
duration operators need to be tuned are as follows: Square, MatMulV2,
BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2
--model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi
sor\\operator_tuning_file_20240613153259.cfg'
95814.01999999999990.7985
AICPU operatorSome operators and task duration exceed 20 us, such as : Cast1. Modify code to avoid aicpu operator39686568.8600000010.0189
" + ], + "text/plain": [ + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| problem | description | suggestion | problem count | total_time(us) | time ratio | income(us) | income ratio |\n", + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| block dim | some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 | 1. Optimize operator by AOE, such as: 'aoe --job_type=2 | 101 | 814.0199999999999 | 1.0 | | |\n", + "| | operator of task duration are as follows: Square, MatMulV2, BatchMatMul, | --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi | | | | | |\n", + "| | SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | sor\\operator_tuning_file_20240613153259.cfg' | | | | | |\n", + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| operator no bound | There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task | 1. 
Optimize operator by AOE, such as: 'aoe --job_type=2 | 95 | 814.0199999999999 | 0.7985 | | |\n", + "| | duration operators need to be tuned are as follows: Square, MatMulV2, | --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi | | | | | |\n", + "| | BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | sor\\operator_tuning_file_20240613153259.cfg' | | | | | |\n", + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| AICPU operator | Some operators and task duration exceed 20 us, such as : Cast | 1. Modify code to avoid aicpu operator | 39 | 686568.860000001 | 0.0189 | | |\n", + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "problems = aicpu_result.get(\"problems\")\n", + "if problems: # 如果存在相关问题则获取相关问题检测描述及建议\n", + " problem_table = PrettyTable(problems.get(\"headers\"))\n", + " for row in problems.get(\"data\"):\n", + " row = [fill(str(element), width=80) for element in row]\n", + " problem_table.add_row(row)\n", + "\n", + " problem_table.align = \"l\"\n", + " problem_table.hrules = ALL\n", + " display(problem_table)\n", + "else:\n", + " print(\"There is no suggestion related to operator no bound.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
op_nameop_typetask_durationinput_shapesinput_data_typesinput_formatsoutput_shapesoutput_data_typesoutput_formatsstack_info
trans_Cast_5Cast493.64""INT32FORMAT_ND""UINT64FORMAT_ND/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279):
dropout; /usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/dropout.py(58): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/language_model.py(236): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/language_model.py(425): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/module.py(184): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/distributed.py(58): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
../../pretrain_gpt.py(88): forward_step;
/profiling_auto_GPT3/megatron/schedules.py(118): forward_step;
/home/s30040711/Megatron-
LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(96):
forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419):
train_step; /profiling_auto_GPT3/megatron/training.py(837): train;
/profiling_auto_GPT3/megatron/training.py(152): pretrain;
../../pretrain_gpt.py(122): <module>
trans_Cast_5Cast413.4""INT32FORMAT_ND""UINT64FORMAT_ND/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279):
dropout; /usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/dropout.py(58): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/language_model.py(236): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/language_model.py(425): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/module.py(184): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/distributed.py(58): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
../../pretrain_gpt.py(88): forward_step;
/profiling_auto_GPT3/megatron/schedules.py(118): forward_step;
/home/s30040711/Megatron-
LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(109):
forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419):
train_step; /profiling_auto_GPT3/megatron/training.py(837): train;
/profiling_auto_GPT3/megatron/training.py(152): pretrain;
../../pretrain_gpt.py(122): <module>
" + ], + "text/plain": [ + "+--------------+---------+---------------+--------------+------------------+---------------+---------------+-------------------+----------------+----------------------------------------------------------------------------------+\n", + "| op_name | op_type | task_duration | input_shapes | input_data_types | input_formats | output_shapes | output_data_types | output_formats | stack_info |\n", + "+--------------+---------+---------------+--------------+------------------+---------------+---------------+-------------------+----------------+----------------------------------------------------------------------------------+\n", + "| trans_Cast_5 | Cast | 493.64 | \"\" | INT32 | FORMAT_ND | \"\" | UINT64 | FORMAT_ND | /usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279): |\n", + "| | | | | | | | | | dropout; /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/dropout.py(58): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/language_model.py(236): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/language_model.py(425): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/module.py(184): forward; |\n", 
+ "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/distributed.py(58): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | ../../pretrain_gpt.py(88): forward_step; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/schedules.py(118): forward_step; |\n", + "| | | | | | | | | | /home/s30040711/Megatron- |\n", + "| | | | | | | | | | LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(96): |\n", + "| | | | | | | | | | forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419): |\n", + "| | | | | | | | | | train_step; /profiling_auto_GPT3/megatron/training.py(837): train; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/training.py(152): pretrain; |\n", + "| | | | | | | | | | ../../pretrain_gpt.py(122): |\n", + "+--------------+---------+---------------+--------------+------------------+---------------+---------------+-------------------+----------------+----------------------------------------------------------------------------------+\n", + "| trans_Cast_5 | Cast | 413.4 | \"\" | INT32 | FORMAT_ND | \"\" | UINT64 | FORMAT_ND | /usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279): |\n", + "| | | | | | | | | | dropout; /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/dropout.py(58): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/language_model.py(236): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | 
packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/language_model.py(425): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/module.py(184): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/distributed.py(58): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | ../../pretrain_gpt.py(88): forward_step; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/schedules.py(118): forward_step; |\n", + "| | | | | | | | | | /home/s30040711/Megatron- |\n", + "| | | | | | | | | | LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(109): |\n", + "| | | | | | | | | | forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419): |\n", + "| | | | | | | | | | train_step; /profiling_auto_GPT3/megatron/training.py(837): train; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/training.py(152): pretrain; |\n", + "| | | | | | | | | | ../../pretrain_gpt.py(122): |\n", + "+--------------+---------+---------------+--------------+------------------+---------------+---------------+-------------------+----------------+----------------------------------------------------------------------------------+" + ] + }, + "metadata": {}, + "output_type": 
"display_data" + } + ], + "source": [ + "if problems: # 如果存在相关问题则获取相关问题检测细节\n", + " aicpu = aicpu_result.get(\"AICPU operator\")\n", + " aicpu_table = PrettyTable(aicpu.get(\"headers\"))\n", + " for row in aicpu.get(\"data\"):\n", + " row = [fill(str(element), width=80) for element in row]\n", + " aicpu_table.add_row(row)\n", + " aicpu_table.hrules = ALL\n", + " display(aicpu_table[:2])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/profiler/advisor_review/config/__init__.py b/profiler/advisor_review/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor_review/config/config.ini b/profiler/advisor_review/config/config.ini new file mode 100644 index 0000000000000000000000000000000000000000..c56c1dad9f0d7e9ac02ab76b0e79e102b010da12 --- /dev/null +++ b/profiler/advisor_review/config/config.ini @@ -0,0 +1,16 @@ +[LOG] +# console_logging_level : DEBUG/INFO/WARNING/ERROR +console_logging_level = INFO +[ANALYSE] +# analysis_result_file : filename of analysis result +analysis_result_file = analysis_result_file.xlsx +# tune_ops_file: filename of tune op name list +tune_ops_file = operator_tuning_file.cfg +[THRESHOLD] +# operator_bound_ratio: (mte, cube, vector, scalar) ratio greater than this value will be checked in operator_bound_checker +operator_bound_ratio = 0.8 +[RULE-BUCKET] +# region : URL of different regions where can download rule yaml file +cn-north-9 = cnnorth9-modelarts-sdk +cn-southwest-2 = cnsouthwest2-modelarts-sdk +cn-north-7 = cnnorth7-modelarts-sdk \ No newline at end 
of file diff --git a/profiler/advisor_review/config/config.py b/profiler/advisor_review/config/config.py new file mode 100644 index 0000000000000000000000000000000000000000..12f4526f8c95a747f97272aed6cf8e4e822da676 --- /dev/null +++ b/profiler/advisor_review/config/config.py @@ -0,0 +1,108 @@ +""" +advisor config +""" +from profiler.advisor.utils.utils import Timer + +import logging +import os +from configparser import ConfigParser + +from profiler.advisor.utils.utils import singleton + +logger = logging.getLogger() + + +@singleton +class Config: + """ + config + """ + # pylint: disable=too-many-instance-attributes + + _CONFIG_DIR_NAME = "config" + _CONFIG_FILE_NAME = "config.ini" + + def __init__(self) -> None: + config = ConfigParser(allow_no_value=True) + self._work_path = os.getcwd() # pwd + self._root_path = os.path.abspath(os.path.join(__file__, "../../")) + config.read(os.path.join(self._root_path, self._CONFIG_DIR_NAME, self._CONFIG_FILE_NAME)) + self.config = config + # ANALYSE + self._analysis_result_file = self._normalize_path(config.get("ANALYSE", "analysis_result_file")) + self._tune_ops_file = os.path.abspath( + os.path.join(self._work_path, f"operator_tuning_file_{Timer().strftime}.cfg")) + self.log_path = None + + def _normalize_path(self, file) -> str: + if not file.startswith("/"): + file = os.path.join(self._work_path, file) + return os.path.abspath(file) + + @property + def work_path(self) -> str: + """ + get work path + :return: work path + """ + return self._work_path + + @property + def root_path(self) -> str: + """ + get root path + :return: root path + """ + return self._root_path + + def set_config(self, key, value) -> None: + """ + set config value + :param key: config key + :param value: config value + """ + setattr(self, key, value) + + def get_config(self, key) -> str: + """ + get value of config + :param key: config key + :return: config value + """ + try: + return getattr(self, key) + except AttributeError: + return "" + + @property 
+ def analysis_result_file(self) -> str: + """ + get filename of op result file + :return: filename + """ + return self._analysis_result_file + + @property + def tune_ops_file(self) -> str: + """ + get filename of tune op file + :return: filename + """ + return self._tune_ops_file + + @property + def operator_bound_ratio(self) -> float: + """ + operator_bound_ratio + """ + return float(self.config.get("THRESHOLD", "operator_bound_ratio")) + + def set_log_path(self, result_file: str, log_path: str = None): + self.log_path = log_path if log_path is not None else os.path.join(self._work_path, "log") + os.makedirs(self.log_path, exist_ok=True) + self.config._analysis_result_file = os.path.join(self.log_path, result_file) + self._analysis_result_file = os.path.join(self.log_path, result_file) + + def remove_log(self): + if self.log_path and os.path.isdir(self.log_path) and not os.listdir(self.log_path): + os.rmdir(self.log_path) diff --git a/profiler/advisor_review/config/profiling_data_version_config.yaml b/profiler/advisor_review/config/profiling_data_version_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f73aecd3baf18e06981ef4d4b0db7d6faadd419a --- /dev/null +++ b/profiler/advisor_review/config/profiling_data_version_config.yaml @@ -0,0 +1,80 @@ +versions: + - version: 8.0.0 + dirs_pattern: + ^PROF_\d{6}_\d{17}_\w+$: + mindstudio_profiler_output: + [ op_summary, msprof ] + class_attr: + op_summary: OpSummary + msprof: Msprof + file_attr: + op_summary: ^op_summary_\d{14}\.csv$ + msprof: ^msprof_\d{14}\.json$ + + - version: 7.0.0 + dirs_pattern: + ^PROF_\d{6}_\d{17}_\w+$: + ^device_\d+$: + summary: + [ op_summary ] + timeline: + [ msprof, task_time ] + host: + sqlite: + [ ge_info ] + class_attr: + op_summary: OpSummary + task_time: TaskTime + msprof: Msprof + ge_info: GeInfo + file_attr: + op_summary: ^op_summary_\d+_\d+_\d{14}\.csv$ + task_time: ^task_time_\d+_\d+_\d{14}\.json$ + msprof: ^msprof_\d+_\d+_\d{14}\.json$ + ge_info: 
ge_info.db + + - version: 7.0.RC1 + dirs_pattern: + ^PROF_\d{6}_\d{17}_\w+$: + ^device_\d+$: + summary: + [ op_summary ] + timeline: + [ msprof, task_time ] + host: + sqlite: + [ ge_info ] + class_attr: + op_summary: OpSummary + task_time: TaskTime + msprof: Msprof + ge_info: GeInfo + file_attr: + op_summary: ^op_summary_\d+_\d+_\d+_\d{14}\.csv$ + task_time: ^task_time_\d+_\d+_\d+_\d{14}\.json$ + msprof: ^msprof_\d+_\d+_\d+_\d{14}\.json$ + ge_info: ge_info.db + + - version: 6.3.RC2 + dirs_pattern: + ^PROF_\d{6}_\d{17}_\w+$: + ^device_\d+$: + summary: + [ op_summary ] + timeline: + [ msprof, task_time ] + host: + sqlite: + [ ge_info ] + class_attr: + op_summary: OpSummary + task_time: TaskTime + msprof: Msprof + ge_info: GeInfo + file_attr: + op_summary: ^op_summary_\d+_\d+\.csv$ + task_time: ^task_time_\d+_\d+\.json$ + msprof: ^msprof_\d+_\d+\.json$ + ge_info: ge_info.db + + diff --git a/profiler/advisor_review/dataset/__init__.py b/profiler/advisor_review/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor_review/dataset/cluster/__init__.py b/profiler/advisor_review/dataset/cluster/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor_review/dataset/cluster/cluster_dataset.py b/profiler/advisor_review/dataset/cluster/cluster_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..09fda2d4dcf2df2f05abb0007befb5c5c36ef824 --- /dev/null +++ b/profiler/advisor_review/dataset/cluster/cluster_dataset.py @@ -0,0 +1,165 @@ +import logging + +import os + +from profiler.advisor.dataset.dataset import Dataset +from profiler.advisor.utils.utils import singleton +from profiler.cluster_analyse.common_func.file_manager import FileManager +from profiler.advisor.common import constant as const +from profiler.cluster_analyse.common_func.constant import 
Constant +from collections import defaultdict +from profiler.cluster_analyse.cluster_analysis import Interface +from profiler.advisor.dataset.cluster.cluster_step_trace_time_bean import ClusterStepTraceTimeBean + +logger = logging.getLogger() + + +class ClusterDataset(Dataset): + + def __init__(self, collection_path, data: dict, **kwargs) -> None: + super().__init__(collection_path, data) + + def is_cluster_analysis_output_exist(self): + """ + check whether input path is valid + """ + for file in os.listdir(self.collection_path): + if file == 'cluster_analysis_output': + print("[INFO]Cluster has been analyzed " + "because of the existence of cluster analysis output directory.") + print("[INFO]Skip Cluster analyze backend.") + return True + return False + + def cluster_analyze(self): + if self.is_cluster_analysis_output_exist(): + return + parameter = { + Constant.COLLECTION_PATH: self.collection_path, + Constant.ANALYSIS_MODE: "all" + } + print("[INFO] cluster analysis is in the process, please wait...") + try: + Interface(parameter).run() + except Exception as e: + raise ValueError(f"Cluster analyze backend failed:{e}") from e + + def load_csv_data(self, file_name, dataBean): + csv_path = os.path.join(self.collection_path, const.CLUSTER_ANALYSIS_OUTPUT, file_name) + if not os.path.exists(csv_path): + msg = "[ERROR] cluster_step_trace_time.csv doesn't exist, terminate analysis." + raise RuntimeError(msg) + data = FileManager.read_csv_file(csv_path, dataBean) + return data + + def load_json_data(self, file_name): + json_path = os.path.join(self.collection_path, const.CLUSTER_ANALYSIS_OUTPUT, file_name) + if not os.path.exists(json_path): + msg = "[ERROR] cluster_communication.json doesn't exist, terminate analysis." 
+ raise RuntimeError(msg) + data = FileManager.read_json_file(json_path) + return data + + +@singleton +class ClusterStepTraceTimeDataSet(ClusterDataset): + RANK = "rank" + + def __init__(self, collection_path: str, data: dict, **kwargs): + self._step_dict = defaultdict() + super().__init__(collection_path, data) + + def _parse(self): + self.cluster_analyze() + try: + step_data = self.load_csv_data(const.CLUSTER_STEP_TIME_CSV, ClusterStepTraceTimeBean) + except RuntimeError as e: + print("捕获到异常:", e) + self._step_dict = None + return False + self._step_dict = self.formate_data(step_data) + return True + + def formate_data(self, step_data: list): + step_dict = defaultdict(lambda: [0, 0, 0]) + for step_bean in step_data: + if step_bean.type == self.RANK: + step_dict[step_bean.index][0] += step_bean.compute + step_dict[step_bean.index][1] += step_bean.communication + step_dict[step_bean.index][2] += step_bean.free + return step_dict + + def get_data(self): + return self._step_dict + + +@singleton +class ClusterCommunicationDataSet(ClusterDataset): + RDMA_TIME_MS = "RDMA time(ms)" + RDMA_SIZE_MB = "RDMA size(mb)" + SDMA_TIME_MS = "SDMA time(ms)" + SDMA_SIZE_MB = "SDMA size(mb)" + RDMA_BANDWIDTH = "RDMA bandwidth(GB/s)" + SDMA_BANDWIDTH = "SDMA bandwidth(GB/s)" + COMMUNICATION_BANDWIDTH_INFO = "Communication Bandwidth Info" + TRANSIT_TIME = "Transit Time(ms)" + TRANSIT_SIZE = "Transit Size(MB)" + SDMA = "SDMA" + RDMA = "RDMA" + + def __init__(self, collection_path: str, data: dict, **kwargs): + self.rank_bw_dict = defaultdict(lambda: { + self.RDMA_TIME_MS: 0, + self.RDMA_SIZE_MB: 0, + self.SDMA_TIME_MS: 0, + self.SDMA_SIZE_MB: 0, + }) + super().__init__(collection_path, data) + + @staticmethod + def compute_ratio(dividend: float, divisor: float): + if abs(divisor) < 1e-15: + return 0 + else: + return round(dividend / divisor, 4) + + def _parse(self): + self.cluster_analyze() + try: + communication_json = self.load_json_data(const.CLUSTER_COMM_JSON) + except RuntimeError 
as e: + print("捕获到异常:", e) + self.rank_bw_dict = None + return False + self.process(communication_json) + return True + + def process(self, communication_json: dict): + for comm_group, group_dict in communication_json.items(): + for step, step_dict in group_dict.items(): + for op, op_dict in step_dict.items(): + self.compute_bandwidth(op_dict) + + def compute_bandwidth(self, op_dict: dict): + for rank_id, rank_dict in op_dict.items(): + try: + rank = int(rank_id) + except ValueError as e: + msg = "[ERROR] Cluster_communication.json has invalid structure." + raise ValueError(msg) from e + for comm_type, bw_dict in rank_dict.get(self.COMMUNICATION_BANDWIDTH_INFO, {}).items(): + if comm_type == self.SDMA: + self.rank_bw_dict[rank][self.SDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE) + self.rank_bw_dict[rank][self.SDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME) + if comm_type == self.RDMA: + self.rank_bw_dict[rank][self.RDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE) + self.rank_bw_dict[rank][self.RDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME) + + for rank, rank_dict in self.rank_bw_dict.items(): + self.rank_bw_dict[rank][self.RDMA_BANDWIDTH] = self.compute_ratio( + self.rank_bw_dict[rank][self.RDMA_SIZE_MB], self.rank_bw_dict[rank][self.RDMA_TIME_MS]) + self.rank_bw_dict[rank][self.SDMA_BANDWIDTH] = self.compute_ratio( + self.rank_bw_dict[rank][self.SDMA_SIZE_MB], self.rank_bw_dict[rank][self.SDMA_TIME_MS]) + + def get_data(self): + return self.rank_bw_dict diff --git a/profiler/advisor_review/dataset/cluster/cluster_step_trace_time_bean.py b/profiler/advisor_review/dataset/cluster/cluster_step_trace_time_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..b108fc77a3f3408d48c79ce6b542f98427d88b0b --- /dev/null +++ b/profiler/advisor_review/dataset/cluster/cluster_step_trace_time_bean.py @@ -0,0 +1,67 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class ClusterStepTraceTimeBean: + STEP = "Step" + TYPE = "Type" + INDEX = "Index" + COMPUTING = "Computing" + COMMUNICATION = "Communication(Not Overlapped)" + FREE = "Free" + + def __init__(self, data: dict): + self._data = data + + @property + def step(self) -> str: + return self._data.get(self.STEP, '') + + @property + def type(self) -> str: + return self._data.get(self.TYPE, '') + + @property + def index(self) -> int: + try: + return int(self._data.get(self.INDEX)) + except ValueError as e: + msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Index'." + raise ValueError(msg) from e + + @property + def compute(self) -> float: + try: + return float(self._data.get(self.COMPUTING, '')) + except ValueError as e: + msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Computing'." + raise ValueError(msg) from e + + @property + def communication(self) -> float: + try: + return float(self._data.get(self.COMMUNICATION, '')) + except ValueError as e: + msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Communication'." + raise ValueError(msg) from e + + @property + def free(self) -> float: + try: + return float(self._data.get(self.FREE, '')) + except ValueError as e: + msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Free'." 
+ raise ValueError(msg) from e + diff --git a/profiler/advisor_review/dataset/dataset.py b/profiler/advisor_review/dataset/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..7f1e40a38b8a4a26585eecfe6271cc75ea054d2d --- /dev/null +++ b/profiler/advisor_review/dataset/dataset.py @@ -0,0 +1,38 @@ +""" +dataset module +""" +import logging +import os + +from profiler.advisor.config.config import Config + +logger = logging.getLogger() + + +class Dataset: + """ + :param collection_path: dataSet absolute path + dataset base class + """ + + def __init__(self, collection_path, data=None) -> None: + if data is None: + data = {} + self.collection_path = os.path.abspath(os.path.join(Config().work_path, collection_path)) + logger.debug("init %s with %s", self.__class__.__name__, self.collection_path) + if self._parse(): + key = self.get_key() + if key not in data: + data[key] = [] + data[key].append(self) + + def _parse(self): + return None + + @classmethod + def get_key(cls): + """ + get key of dataset + :return: key + """ + return cls.__name__.rsplit('.', maxsplit=1)[-1] diff --git a/profiler/advisor_review/dataset/graph_dataset.py b/profiler/advisor_review/dataset/graph_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..951de7fd26b1f986d25285547e63b1a420968249 --- /dev/null +++ b/profiler/advisor_review/dataset/graph_dataset.py @@ -0,0 +1,53 @@ +import logging +from typing import List + +from profiler.advisor.dataset.dataset import Dataset +from profiler.advisor.common.graph.graph_parser import HostGraphParser +from profiler.advisor.common.graph.graph import Graph +from profiler.advisor.utils.utils import load_parameter, lazy_property, get_file_path_from_directory + +logger = logging.getLogger() + + +class GraphDataset(Dataset): + """ + data directory dataset + """ + FILE_PATTERN = "ATT_ADVISOR_GRAPH_FILE" + + def __init__(self, collection_path, data: dict = None, **kwargs) -> None: + self.graph_files: 
List[HostGraphParser] = [] + super().__init__(collection_path, data) + + def _parse(self): + graph_list = get_file_path_from_directory(self.collection_path, + lambda file: file.endswith( + load_parameter(self.FILE_PATTERN, "_Build.txt"))) + + for graph_file_path in graph_list[-1:]: + logger.info("Prepare to parse %s as default graph.", graph_file_path) + graph_file = HostGraphParser(graph_file_path) + self.graph_files.append(graph_file) + return self.graph_files + + @lazy_property + def graphs(self) -> List[Graph]: + """ + get a list of graphs + return: List[Graph] + """ + graphs = [] + for parser in self.graph_files: + graph = Graph(nodes=parser.nodes, + edges=parser.edges, + name="Default") + graph.build() + graphs.append(graph) + graphs.sort(key=lambda g: g.name) + if len(self.graph_files) >= 1: + del self.graph_files[0] # remove previous useless data + return graphs + + def is_empty(self) -> bool: + """check empty graph dataset""" + return len(self.graph_files) == 0 diff --git a/profiler/advisor_review/dataset/profiling/__init__.py b/profiler/advisor_review/dataset/profiling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor_review/dataset/profiling/builder_base.py b/profiler/advisor_review/dataset/profiling/builder_base.py new file mode 100644 index 0000000000000000000000000000000000000000..2bfe14f9462b701db2a4ede1d539a07659f48ae8 --- /dev/null +++ b/profiler/advisor_review/dataset/profiling/builder_base.py @@ -0,0 +1,39 @@ +""" +profiling base +""" +import logging +from typing import Dict, List + +from profiler.advisor.dataset.profiling.profiling_parser import ProfilingParser +from profiler.advisor.utils.utils import join_prof_path + +logger = logging.getLogger() + + +class ProfilingBuilderBase: + """ + profiling base + """ + DATA_LIST: List[Dict] = [] + + def __init__(self, path) -> None: + self._path = path + + def parse_data(self) -> bool: + """ + parse 
data for file in data_dir + """ + if isinstance(self, ProfilingParser): + return True + ret = False + for data in self.DATA_LIST: + class_name = data.get("class_name") + if class_name is not None: + if data.get("subdir_name"): + data_class = data.get("class_name")(join_prof_path(self._path, data.get("subdir_name"))) + else: + data_class = data.get("class_name")(self._path) + if data_class.parse_data(): + setattr(self, str(data.get("attr_name")), data_class) + ret = True + return ret diff --git a/profiler/advisor_review/dataset/profiling/db_manager.py b/profiler/advisor_review/dataset/profiling/db_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..c9fb73c7cf69d94c3ca1aba8c726f574d63cd1a3 --- /dev/null +++ b/profiler/advisor_review/dataset/profiling/db_manager.py @@ -0,0 +1,70 @@ +""" +connection manager +""" +import os +import re +from typing import List + +from sqlalchemy import MetaData, create_engine + + +class ConnectionManager: + """ + Connection Manager + """ + + def __init__(self, path, db_name): + self.db_path = os.path.join(path, db_name) + self.connection = create_engine(f'sqlite:///{self.db_path}') + self.metadata = MetaData() + self.metadata.reflect(bind=self.connection) + + def __call__(self, *args, **kwargs): + return self.connection + + @staticmethod + def check_db_exists(db_path:str, dbs:List) -> bool: + """ + check db exists + """ + if not os.path.isdir(db_path): + return False + for prof_db in dbs: + if not os.access(db_path, os.R_OK) or prof_db not in os.listdir(db_path): + return False + return True + + def check_table_exists(self, tables:List) -> bool: + """ + check table exists + """ + for table in tables: + if table not in self.metadata.tables: + return False + return True + + def check_column_exists(self, table_name:str, columns:List) -> bool: + """ + check column exists + """ + if table_name not in self.metadata.tables: + return False + for column in columns: + if column not in 
self.metadata.tables[table_name].columns: + return False + return True + + @classmethod + def get_connection(cls, path, dbs, tables=None, is_host=False): + """ + get connection + """ + if is_host: + pattern = r"/device_[0-9]" + path = re.sub(pattern, "/host", path) + if not cls.check_db_exists(path, dbs): + return None + conn = cls(path, dbs) + if tables and not conn.check_table_exists(tables): + return None + return conn diff --git a/profiler/advisor_review/dataset/profiling/device_info.py b/profiler/advisor_review/dataset/profiling/device_info.py new file mode 100644 index 0000000000000000000000000000000000000000..b58930777f969d023eab7885a9095d46aa7ba6ea --- /dev/null +++ b/profiler/advisor_review/dataset/profiling/device_info.py @@ -0,0 +1,61 @@ +""" +profiling info +""" +import json +import logging + +from profiler.advisor.config.config import Config +from profiler.advisor.utils.utils import get_file_path_from_directory + +logger = logging.getLogger() + + +class DeviceInfoParser: + """ + profiling info + device_id device 名称信息 + "aiv_num" ai vector 个数 + "ai_core_num" aicore 个数 + """ + DATA_LIST = [] + + def __init__(self, path) -> None: + self._path = path + + def parse_data(self) -> bool: + """ + parse profiling data + :return: true for success or false + """ + file_list = get_file_path_from_directory(self._path, lambda x: x.startswith("info.json.")) + if not file_list: + return False + for info in file_list: + if self._parse(info): + return True + return False + + @staticmethod + def _parse(info_file: str) -> bool: + if info_file.endswith("done"): + return False # skip info.json.0.done + try: + with open(info_file, encoding="utf-8") as file: + info = json.load(file) + except (IOError, ValueError) as error: + logger.error("Parse json info file %s failed : %s", info_file, error) + return False + if "DeviceInfo" not in info: + logger.error("No device info in json info file %s", info_file) + return False + config = Config() + for device_info in info["DeviceInfo"]: 
+ if "id" in device_info: + config.set_config("device_id", device_info["id"]) + if "aiv_num" in device_info: + config.set_config("aiv_num", device_info["aiv_num"]) + if "ai_core_num" in device_info: + config.set_config("ai_core_num", device_info["ai_core_num"]) + return True + logger.error("No ai_core_num in json info file %s", info_file) + return False diff --git a/profiler/advisor_review/dataset/profiling/info_collection.py b/profiler/advisor_review/dataset/profiling/info_collection.py new file mode 100644 index 0000000000000000000000000000000000000000..b1f84313bb7980ea2186d2727db51b5fba49e12e --- /dev/null +++ b/profiler/advisor_review/dataset/profiling/info_collection.py @@ -0,0 +1,270 @@ +""" +profiling info +""" +import decimal +import logging + +from profiler.advisor.utils.utils import lazy_property + +logger = logging.getLogger() + + +class Info: + """ + op info + """ + _attr_pre_fix_list = [""] + + def add_attr(self, key: str, value: str): + """ + add attr to op info + :param key: op info key + :param value: op info value + :return: None + """ + if not key or hasattr(self, key): + return + setattr(self, key, value) + + def has_attr(self, key: str, strict_mode=False): + """ + check if op info has attr key + :param key: attr key + :return: true or false + """ + if strict_mode: + return hasattr(self, key) + for prefix in self._attr_pre_fix_list: + attr = prefix + key + if hasattr(self, attr): + return True + return False + + def get_attr(self, key, strict_mode=False): + """ + get attr value by key + :param key: attr key + :return: attr value + """ + if strict_mode: + if hasattr(self, key): + return getattr(self, key) + else: + for prefix in self._attr_pre_fix_list: + attr = prefix + key + if key.startswith("mac") and prefix == "aiv_": + # e.g mac_ratio must match aic_mac_ratio, not aiv_mac_ratio + continue + if key.startswith("vec") and prefix == "aic_": + # e.g vec_ratio must match aiv_vec_ratio, not aic_vec_ratio + continue + if hasattr(self, attr): + 
return getattr(self, attr) + return "" + + def get_float_attr(self, attr, strict_mode=False): + """ + get attr value by key + :param key: attr key + :return: attr value + """ + try: + return float((self.get_attr(attr, strict_mode))) + except (ValueError, FloatingPointError): + pass + return 0 + + def get_decimal_attr(self, attr, strict_mode=False): + """ + get attr value by key + :param key: attr key + :return: attr value + """ + try: + return decimal.Decimal((self.get_attr(attr, strict_mode))) + except (ValueError, decimal.InvalidOperation): + pass + return decimal.Decimal(0) + + def get_attrs(self) -> dict: + """ + get attr list + :return: attr list + """ + return self.__dict__ + + +class OpInfo(Info): + """ + summary info + """ + + _attr_pre_fix_list = ["", "aic_", "aiv_"] + _mac_ratio_attrs = ["mac_ratio", "mac_fp16_ratio", "mac_int8_ratio", "aic_mac_ratio"] + _aicore_time_key = ["aicore_time", "aiv_time"] + _total_cycles_key = ["total_cycles", "aic_total_cycles", "aiv_total_cycles"] + + def __lt__(self, other): + return self.get_float_attr("task_start_time") < other.get_float_attr("task_start_time") + + @lazy_property + def is_cube_op(self) -> bool: + """ + check type of operator if cube or not + """ + for attr in self._mac_ratio_attrs: + if hasattr(self, attr): + try: + if float(getattr(self, attr)) > 0: + if hasattr(self, "ffts_type") and getattr(self, "ffts_type") == "1": + logger.warning( + "ffts type of op %s is vector buf mac ratio is not 0", getattr(self, "op_name") + ) + return True + except ValueError: + pass + # not cube op + if hasattr(self, "ffts_type") and getattr(self, "ffts_type") == "0": + logger.warning("ffts type of op %s is cube but mac ratio is 0", getattr(self, "op_name")) + return False + + @lazy_property + def has_mac_ratio(self) -> bool: + """ + check if op_info has mac ratio + """ + for attr in self._mac_ratio_attrs: + if attr in self.__dict__: + return True + return False + + def attr_sum(self, attr_list): + """sum of a list attrs""" 
+ total = 0 + for attr in attr_list: + total += self.get_float_attr(attr, strict_mode=True) + return total + + def get_aicore_time(self): + """ + get sum of aicore time and ai vector core time + """ + return self.attr_sum(self._aicore_time_key) + + def get_total_cycles(self): + """ + get sum of total cycle for aicore and ai vector core + """ + return self.attr_sum(self._total_cycles_key) + + +class TaskInfo: + """ + task info + """ + EVENT_TYPE = {"metadata": ['M'], "duration": ['B', 'E'], "complete": ['X'], 'flow': ['s', 't', 'f']} + + def __init__(self, content: dict) -> None: + self._name = content.get("name", "") + self._pid = content.get("pid", 0) + self._tid = content.get("tid", 0) + self._start_time = float(content.get("ts", 0.0)) + self._dur = float(content.get("dur", 0.0)) + self._args = content.get("args", {}) + self._cat = content.get("cat", "") + self._id = content.get("id", "") + + @property + def pk_id(self): + """ + get id + :return: id + """ + return self._id + + @property + def pid(self): + """ + get pid + :return: pid + """ + return self._pid + + @property + def tid(self): + """ + get tid + :return: tid + """ + return self._tid + + @property + def task_type(self): + """ + get pid + :return: pid + """ + return self._args.get("Task Type", "NA") + + @property + def start_time(self): + """ + get starttime + :return: starttime + """ + return self._start_time + + @property + def end_time(self): + """ + get endtime + :return: endtime + """ + return self._start_time + self._dur + + @property + def dur(self): + """ + get duration + :return: duration + """ + return self._dur + + @property + def name(self): + """ + get task name + :return: task name + """ + return self._name + + @property + def stream_id(self): + """ + get stream_id + :return: steram id + """ + return self._args.get("Stream Id", "NA") + + @property + def task_id(self): + """ + get task id + :return: task_id + """ + return self._args.get("Task Id", "NA") + + @property + def args(self): + """ 
+ get args of task + :return: args + """ + return self._args + + @property + def cat(self): + """ + get category of task + """ + return self._cat diff --git a/profiler/advisor_review/dataset/profiling/profiling_dataset.py b/profiler/advisor_review/dataset/profiling/profiling_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..46d4a4fe8b12a419f6d0d7472f9776369e122f03 --- /dev/null +++ b/profiler/advisor_review/dataset/profiling/profiling_dataset.py @@ -0,0 +1,79 @@ +import logging +import os + +import yaml +from profiler.advisor.common import constant +from profiler.advisor.common.profiling.ge_info import GeInfo +from profiler.advisor.common.profiling.msprof import Msprof +from profiler.advisor.common.profiling.op_summary import OpSummary +from profiler.advisor.common.profiling.tasktime import TaskTime +from profiler.advisor.dataset.dataset import Dataset +from profiler.advisor.dataset.profiling.device_info import DeviceInfoParser +from profiler.advisor.utils.utils import join_prof_path + + +logger = logging.getLogger() + + +class ProfilingDataset(Dataset): + PROF_TYPE = "" + + def __init__(self, collection_path, data: dict, **kwargs) -> None: + self.cann_version = kwargs.get("cann_version", constant.DEFAULT_CANN_VERSION) + self.PROF_TYPE = kwargs.get("profiling_type", constant.DEFAULT_PROFILING_TYPE) + self.patterns = self.parse_pattern() + self.current_version_pattern = self.get_current_version_pattern() + super().__init__(collection_path, data) + + def _parse(self): + info = DeviceInfoParser(self.collection_path) + if info.parse_data(): + self._info = info + ret = False + if self.current_version_pattern is not None: + self.build_from_pattern(self.current_version_pattern["dirs_pattern"], self.collection_path) + ret = True + + return ret + + def build_from_pattern(self, dirs_pattern, current_path): + if isinstance(dirs_pattern, dict): + for key, value in dirs_pattern.items(): + self.build_from_pattern(value, join_prof_path(current_path, 
key)) + elif isinstance(dirs_pattern, list): + for item in dirs_pattern: + data_class = globals()[self.current_version_pattern.get('class_attr').get(item)] + data_class.FILE_PATTERN = self.current_version_pattern.get('file_attr').get(item) + data_object = data_class(current_path) + is_success = data_object.parse_data() + if is_success: + setattr(self, item, data_object) + else: + logger.warning("Skip parse %s from local path %s", self.current_version_pattern.get('class_attr').get(item), current_path) + else: + logger.warning(f"Unsupported arguments : %s to build %s", dirs_pattern, self.__class__.__name__) + + def get_current_version_pattern(self): + for version_config_dict in self.patterns['versions']: + if version_config_dict['version'] == self.cann_version: + return version_config_dict + return dict() + + def parse_pattern(self, config_path="config/profiling_data_version_config.yaml"): + + if not os.path.isabs(config_path): + config_path = os.path.join(os.path.dirname(__file__), + "../", "../", config_path) + + if not os.path.exists(config_path): + logger.warning("Skip parse profiling dataset, because %s does not exist.", config_path) + return [] + + with open(config_path, 'r') as f: + patterns = yaml.safe_load(f) + + return patterns + + def collection_path(self): + """collection_path""" + return self.collection_path diff --git a/profiler/advisor_review/dataset/profiling/profiling_parser.py b/profiler/advisor_review/dataset/profiling/profiling_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..bb4caeb29e5c94cbc4373b1d6b10e32f3e10e02e --- /dev/null +++ b/profiler/advisor_review/dataset/profiling/profiling_parser.py @@ -0,0 +1,132 @@ +import csv +import json +import os +import re +from typing import List, Dict + +from profiler.advisor.dataset.profiling.info_collection import logger +from profiler.advisor.utils.utils import get_file_path_from_directory, SafeOpen, format_excel_title + + +class ProfilingParser: + """ + profiling + """ + 
FILE_PATTERN = "" + FILE_PATTERN_MSG = "" + FILE_INFO = "" + FILE_PATH = "" + + def __init__(self, path: str) -> None: + self._path = path + self._raw_data: List[List[str]] = [] + self._filename = "" + + @staticmethod + def file_match_func(pattern): + """file match function""" + return lambda x: re.search(re.compile(pattern), x) + + def parse_data(self) -> bool: + """ + pase task time file + :return: true or false + """ + if self._parse_from_file(): + return True + return False + + def _parse_from_file(self): + file_list = get_file_path_from_directory(self._path, self.file_match_func(self.FILE_PATTERN)) + if not file_list: + return False + ## get last file + file = file_list[-1] + self.FILE_PATH = file + if len(file_list) > 1: + logger.warning("Multiple copies of %s were found, use %s", self.FILE_INFO, file) + return self.parse_from_file(file) + + @staticmethod + def get_float(data) -> float: + """ + get float or 0.0 + """ + try: + return float(data) + except (FloatingPointError, ValueError): + return 0.0 + + def parse_from_file(self, file): + """ + parse from file + """ + return False + + @staticmethod + def _check_csv_file_format(csv_file_name: str, csv_content: List[List[str]]): + if not csv_content: + logger.error("%s is empty", csv_file_name) + return False + return True + + def _parse_csv(self, file, check_csv=True) -> bool: + logger.debug("Parse file %s", file) + self._filename = os.path.splitext(os.path.basename(file))[0] + with SafeOpen(file, encoding="utf-8") as csv_file: + try: + csv_content = csv.reader(csv_file) + for row in csv_content: + self._raw_data.append(row) + if check_csv and not self._check_csv_file_format(file, self._raw_data): + logger.error("Invalid csv file : %s", file) + return False + except OSError as error: + logger.error("Read csv file failed : %s", error) + return False + + if not csv_file: + return False + if not self._raw_data: + logger.warning("File %s has no content", file) + return False + return True + + def _parse_json(self, 
file) -> bool: + logger.debug("Parse file %s", file) + self._filename = os.path.splitext(os.path.basename(file))[0] + try: + with open(file, encoding="utf-8") as json_file: + self._raw_data = json.load(json_file) + except (OSError, ValueError) as error: + logger.error("Parse json file %s failed : %s", file, error) + return False + return True + + def get_raw_data(self): + """ + get raw file name and data + """ + return self._filename, self._raw_data + + @staticmethod + def _get_csv_title(data: List, number=0, title_index=0): + """ + number = 0 replace (us) (ns).. + other replace " " to "_" + title_index: position of title default 0 + """ + title_dict: Dict[int, str] = {} + for idx, title in enumerate(data[title_index]): + if number == 0: + title_dict[idx] = format_excel_title(title) + else: + title_dict[idx] = title.replace(" ", "_") + return title_dict + + @property + def path(self): + """ + path + """ + return self._path diff --git a/profiler/advisor_review/dataset/timeline_event_dataset.py b/profiler/advisor_review/dataset/timeline_event_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..94b6fdfef78c044e37e24772699ed7ea67b0da30 --- /dev/null +++ b/profiler/advisor_review/dataset/timeline_event_dataset.py @@ -0,0 +1,220 @@ +import logging +from typing import List + +import ijson +from profiler.advisor.dataset.dataset import Dataset +from tqdm import tqdm + +from profiler.advisor.common import constant as const +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.advisor.utils.utils import get_file_path_from_directory +from profiler.advisor.utils.utils import singleton + +logger = logging.getLogger() + + +class OpCompileCollector: + def __init__(self): + self._total_op_compile_counter = 0 + self._total_op_compile_time = 0.0 + + @property + def total_time(self): + return self._total_op_compile_time + + @property + def total_count(self): + return self._total_op_compile_counter + + def is_empty(self): + return 
self._total_op_compile_counter == 0 + + def update(self, event: TimelineEvent): + self._total_op_compile_time += float(event.dur) + self._total_op_compile_counter += 1 + + def unset(self): + self._total_op_compile_counter = 0 + self._total_op_compile_time = 0.0 + + +@singleton +class TimelineEventDataset(Dataset): + + def __init__(self, collection_path, data: dict, **kwargs) -> None: + self._ops_with_task_type = {} + self._ops_with_stack = {} + self._ops_compile = OpCompileCollector() + self._torch_to_npu = {} + self._acl_to_npu = set() + self._aten: List[str] = [] + self._optimizer: List[str] = [] + self.timeline_dir = collection_path + self.timeline_data_list = get_file_path_from_directory(collection_path, lambda file: file.endswith("trace_view.json")) + self.dataset_len = None + self.analysis_mode = kwargs.get("analysis_mode") + self.task_type = kwargs.get("task_type") + self.cann_version = kwargs.get("cann_version") + self.torch_version = kwargs.get("torch_version") + + if self.analysis_mode in ["fusion_ops", "all"]: + logger.info("Load fusion operators database for cann version '%s' and torch version '%s'", + self.cann_version, self.torch_version) + + super().__init__(collection_path, data) + + if self.analysis_mode in ["op_stack", "all"]: + self._task_op_names = list(set([event_key.split("-")[0] for event_key in self._ops_with_task_type.keys()])) + + self._post_process() + + + @property + def ops_with_stack(self): + return self._ops_with_stack + + @property + def ops_compile(self): + return self._ops_compile + + @property + def torch_to_npu(self): + return self._torch_to_npu + + @property + def acl_to_npu(self): + return self._acl_to_npu + + @property + def ops_with_task_type(self): + return self._ops_with_task_type + + @property + def task_op_names(self): + return self._task_op_names + + @property + def optimizer(self): + return self._optimizer + + @property + def aten(self): + return self._aten + + def _parse(self): + + if len(self.timeline_data_list) == 0: 
+ logger.warning("Please ensure trace_view.json in %s, skip timeline analysis.", self.timeline_dir) + return False + + if len(self.timeline_data_list) > 1: + logger.warning("Please ensure only one trace_view.json in %s, there will analyze first timeline profiling data.", self.timeline_dir) + self.timeline_data_list = [self.timeline_data_list[0]] + + result = self.parse_data_with_generator(self._add_event) + + if not self.dataset_len: + self.dataset_len = len(result) + + return True + + def parse_data_with_generator(self, func): + result = [] + try: + with open(self.timeline_data_list[0], "r") as f: + for i, event in tqdm(enumerate(ijson.items(f, "item")), + leave=False, ncols=100, desc="Building dataset for timeline analysis", + total=self.dataset_len): + func_res = func(index=i, event=event) + if func_res is not None: + result.append(func_res) + except Exception as e: + logger.warning("Error %s while parsing file %s, continue to timeline analysis", e, + self.timeline_data_list[0]) + return result + + def _add_ops_with_task_type(self, event): + key = f"{event.name}-{event.ts}" + self._ops_with_task_type[key] = TimelineEvent( + { + const.TASK_TYPE: event.args.get(const.TASK_TYPE), + "task_id": event.args.get("Task Id"), + "tid": event.tid, + "name": event.name, + "ts": str(event.ts) + } + ) + + def _add_ops_with_stack(self, event): + self._ops_with_stack[str(event.ts)] = TimelineEvent({"name": event.name, "dataset_index": event.dataset_index}) + + def _add_torch_to_npu(self, event): + key = f"{event.ph}-{event.id}" + self._torch_to_npu[key] = TimelineEvent({"tid": event.tid, "ts": str(event.ts)}) + + def _add_acl_to_npu(self, event): + # op with task type equals to ai_cpu which derived from acl_to_npu do not have stacks + self._acl_to_npu.add(str(event.ts)) + + def _add_op_compile(self, event: TimelineEvent): + if event.name == const.OP_COMPILE_NAME or event.args.get("id") == const.OP_COMPILE_ID: + self._ops_compile.update(event) + + def _add_optimizer(self, event: 
TimelineEvent): + self._optimizer.append(TimelineEvent({"name": event.name, "dataset_index": event.dataset_index})) + + def _add_aten(self, event: TimelineEvent): + self._aten.append(TimelineEvent({ + "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur + })) + + def _add_event(self, index, event): + event["dataset_index"] = index + if not isinstance(event, TimelineEvent): + event = TimelineEvent(event) + + self._add_op_compile(event) + if self.analysis_mode == "fusion_ops": + self._add_event_for_fusion_ops(event) + elif self.analysis_mode == "op_stack": + self._add_event_for_op_stack(event) + else: + self._add_event_for_fusion_ops(event) + self._add_event_for_op_stack(event) + return True + + def _add_event_for_fusion_ops(self, event): + if event.name.lower().startswith(f"{const.ATEN}{const.ATEN_SEP}") or event.name.lower().startswith( + f"{const.NPU}{const.ATEN_SEP}"): + self._add_aten(event) + return + + if event.name.startswith(f"{const.OPTIMIZER}.{const.OPTIMIZER_STEP}{const.OPTIMIZER_SEP}"): + self._add_optimizer(event) + return + + def _add_event_for_op_stack(self, event): + if event.name.lower() == const.TORCH_TO_NPU: + self._add_torch_to_npu(event) + return + + if event.args.get(const.CALL_STACKS): + self._add_ops_with_stack(event) + return + + if event.args.get(const.TASK_TYPE) and event.args.get(const.TASK_TYPE) in [const.AI_CORE, const.AI_CPU]: + self._add_ops_with_task_type(event) + return + + if event.name and event.ts and event.name == const.ACL_TO_NPU: + self._add_acl_to_npu(event) + return + + def _post_process(self): + # eliminate sub aten operator of the first level aten operator by 'ts' and 'dur', + # keep the first level aten operator contiguous + formated_atens = [] + for aten_event in sorted(self._aten, key=lambda x: x.get("ts", -1)): + if not formated_atens or not formated_atens[-1].ts_include(aten_event): + formated_atens.append(aten_event) + self._aten = formated_atens diff --git 
a/profiler/advisor_review/display/__init__.py b/profiler/advisor_review/display/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor_review/display/html/__init__.py b/profiler/advisor_review/display/html/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor_review/display/html/render.py b/profiler/advisor_review/display/html/render.py new file mode 100644 index 0000000000000000000000000000000000000000..8ea7c9e0fc22c7da71a673e399fcfc231fbf1453 --- /dev/null +++ b/profiler/advisor_review/display/html/render.py @@ -0,0 +1,45 @@ +import os +import logging +from typing import List, Dict + +from jinja2 import Environment, FileSystemLoader +from profiler.advisor.common import constant + +from profiler.advisor.config.config import Config +from profiler.advisor.utils.utils import singleton, safe_write + +logger = logging.getLogger() + + +@singleton +class HTMLRender: + def __init__(self): + self.html = "" + self.render_list: Dict[str, List] = {} + + def render_html(self, template_dir: str = "templates", template_name: str = "main.html", + template_header=constant.DEFAULT_TEMPLATE_HEADER): + self.html = self.render_template("main", template_dir, template_name, render_list=self.render_list, + template_header=template_header) + + def render_template(self, key: str, template_dir: str, template_name: str, **kwargs): + if not os.path.isabs(template_dir): + template_dir = os.path.join(os.path.dirname(__file__), template_dir) + + env = Environment(loader=FileSystemLoader(template_dir), + autoescape=True) + template = env.get_template(template_name) + rendered_html = template.render(**kwargs) + if key not in self.render_list: + self.render_list[key] = [] + self.render_list[key].append(rendered_html) + return rendered_html + + def save_to_file(self, save_path: str): + if not 
save_path.endswith(".html"): + logger.error("Skip save html file because file name must endswith `.html`, " + "but got %s.", os.path.basename(save_path)) + return + + safe_write(self.html, save_path) + logger.info("Save suggestion to %s.", os.path.join(Config().work_path, save_path)) diff --git a/profiler/advisor_review/display/html/templates/__init__.py b/profiler/advisor_review/display/html/templates/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor_review/display/html/templates/affinity_api.html b/profiler/advisor_review/display/html/templates/affinity_api.html new file mode 100644 index 0000000000000000000000000000000000000000..4d12c3e37536392d122f85fc6ef3a4fcc123ef77 --- /dev/null +++ b/profiler/advisor_review/display/html/templates/affinity_api.html @@ -0,0 +1,50 @@ +{% if result|length > 0 %} +
+

Affinity API Issues

+
+ The analysis results of following affinity APIs are based on runtime env + cann-{{ cann_version }} + and + torch-{{ torch_version }} + +
+ + {% if empty_stacks %} + Suggestion: + These APIs have no code stack. If parameter 'with_stack=False' was set while profiling, please refer to + Ascend PyTorch Profiler to set + 'with_stack=True'. Otherwise, ignore following affinity APIs due to backward broadcast lack of stack. + {% endif %} + + {% for api_name, stacks in result.items() %} + + {% if empty_stacks %} +
{{api_name|safe}}
+ + {% else %} + +
{{api_name|safe}}
+
+ +
+ {% for stack in stacks %} +
No.{{loop.index|safe}} code stack, called {{stack[1]|safe}} times
+ + {% endfor %} +
+
+ {% endif %} + + {% endfor %} + +
+ +
+
+{% endif %} diff --git a/profiler/advisor_review/display/html/templates/cluster_analysis.html b/profiler/advisor_review/display/html/templates/cluster_analysis.html new file mode 100644 index 0000000000000000000000000000000000000000..32379d56fcb87a78269612107d1b7634b722d8d8 --- /dev/null +++ b/profiler/advisor_review/display/html/templates/cluster_analysis.html @@ -0,0 +1,49 @@ +
+

{{title|safe}}

+
+
+ + {% if result.get("Description") %} +
Description
+ + {% endif %} + + {% if result.get("Suggestion") %} +
Suggestion
+ + {% endif %} + + {% if result.get("details") %} +
details
+
+ {% for item in result.get("details") %} + + + {% for header in item.get("headers") %} + + {% endfor %} + + {% for row in item.get("data") %} + + {% for element in row %} + {% if element is number %} + + {% else %} + + {% endif %} + {% endfor %} + + {% endfor %} +
{{ header }}
{{ element|round(2) }}{{ element }}
+ {% endfor %} +
+ {% endif %} + +
+ +
+
\ No newline at end of file diff --git a/profiler/advisor_review/display/html/templates/compute_analysis.html b/profiler/advisor_review/display/html/templates/compute_analysis.html new file mode 100644 index 0000000000000000000000000000000000000000..e1907c091b705969004bf709db24211c66c38107 --- /dev/null +++ b/profiler/advisor_review/display/html/templates/compute_analysis.html @@ -0,0 +1,29 @@ +
+

Abnormal Performance Operator

+
+ {{table.get("title")}} + + + + {% for header in table.get("headers") %} + + {% endfor %} + + {% for row in table.get("rows") %} + + {% for element in row %} + {% if element is number %} + + {% else %} + + {% endif %} + {% endfor %} + + {% endfor %} +
{{ header }}
{{ element|round(2) }}{{ element }}
+ {% if call_stack %} + call stack:
+ {{call_stack}} + {% endif %} +
+
\ No newline at end of file diff --git a/profiler/advisor_review/display/html/templates/fusion.html b/profiler/advisor_review/display/html/templates/fusion.html new file mode 100644 index 0000000000000000000000000000000000000000..605a9d748f7d4499a603efb87bc310fab9bc02f3 --- /dev/null +++ b/profiler/advisor_review/display/html/templates/fusion.html @@ -0,0 +1,47 @@ +{% if candidates|length > 0 %} +
+

Fusion Issues

+
+
+ {% for node in candidates %} +
{{node.op_pass|safe}}
+
+ + + + + + + + + + + +
StructureCountsElapsed Time(us)
{{ node.fusion_pattern|safe }}{{ node.counts|safe }}{{ node.total_duration|safe }}
+
+ {% for match in node.matches %} +
SubGraph {{ loop.index|safe }}
+
+ + + + + + + {% for node in match %} + + + + + + {% endfor %} +
OP NameOP TypeElapsed Time(us)
{{ node.op_name|safe }}{{ node.dtype|safe }}{{ node.duration|safe }}
+
+ {% endfor %} +
+
+ {% endfor %} +
+
+
+{% endif %} diff --git a/profiler/advisor_review/display/html/templates/main.html b/profiler/advisor_review/display/html/templates/main.html new file mode 100644 index 0000000000000000000000000000000000000000..3727125b419547fc6a9ac9743eab34e1e1b76256 --- /dev/null +++ b/profiler/advisor_review/display/html/templates/main.html @@ -0,0 +1,203 @@ + + + + + + + +
+

Performance Optimization Suggestions

+{% for key, renders in render_list.items() %} + {% if key == 'operator'%} +
+

computation

+
+ {% for render in renders %} + {{render|safe}} + {% endfor %} +
+
+ {% else %} +
+

{{ key }}

+
+ {% for render in renders %} + {{render|safe}} + {% endfor %} +
+
+ {% endif %} +{% endfor %} + +
+ + + + + \ No newline at end of file diff --git a/profiler/advisor_review/display/html/templates/operator_ai_cpu.html b/profiler/advisor_review/display/html/templates/operator_ai_cpu.html new file mode 100644 index 0000000000000000000000000000000000000000..b3235a88022fc3973ae0098f543d94cc4b7fac25 --- /dev/null +++ b/profiler/advisor_review/display/html/templates/operator_ai_cpu.html @@ -0,0 +1,61 @@ +
+

AICPU Issues

+
+ + + + + + + + + + + + + +
DescriptionSuggestionElapsed Time(us)Time Ratio
{{ format_result.record.optimization_item.description|safe }}{{ format_result.suggestion|safe }}{{ format_result.task_duration|safe }}{{ format_result.record.statistics_item.task_duration_ratio|safe }}
+
+ {% for op_type, op_info in format_result.statistic %} +
{{ op_type|safe }}
+
+ + + + + + + + + + + +
Operator TypeCountsElapsed Time(us)
{{ op_info.summary.op_type|safe }}{{ op_info.summary.counts|safe }}{{ op_info.summary.total_duration|safe }}
+
+ {% for trace_stack, info in op_info.op_info_list %} +
+ {{ info.summary.op_type|safe }} | Input DType:({{info.op_info_list[0].input_data_types|safe}}) | Output DType:({{info.op_info_list[0].output_data_types|safe}}) | Counts:{{ info.summary.counts|safe}} | Elapsed Time(us):{{ + info.summary.total_duration|safe}} +
+
+ {% if info.op_info_list[0].suggestions|length > 0 %} +
+ {% for suggestion in info.op_info_list[0].suggestions %} +

+ Suggestion {{ loop.index|safe }}: {{suggestion|safe}} +

+ {% endfor %} +
+ {% else %} +

Suggestion 1: Modify code to avoid AICPU operator

+ {% endif %} +
+ {{ info.op_info_list[0].stack_info|safe }} +
+ {% endfor %} +
+
+ {% endfor %} +
+
+
\ No newline at end of file diff --git a/profiler/advisor_review/display/html/templates/operator_block_dim.html b/profiler/advisor_review/display/html/templates/operator_block_dim.html new file mode 100644 index 0000000000000000000000000000000000000000..4e2c832f623a4c0a0f315ebdc2b7a97aeb1996a1 --- /dev/null +++ b/profiler/advisor_review/display/html/templates/operator_block_dim.html @@ -0,0 +1,38 @@ +
+

Block Dim Issues

+
+ + + + + + + + + + + + + +
DescriptionSuggestionElapsed Time(us)Time Ratio
{{ format_result.record.optimization_item.description|safe }}{{ format_result.suggestion|safe }}{{ format_result.task_duration|safe }}{{ format_result.record.statistics_item.task_duration_ratio|safe }}
+
+ {% for op_type, op_info in format_result.statistic %} +
{{ op_type|safe }}
+
+ + + + + + + + + + + +
Operator TypeCountsElapsed Time(us)
{{ op_info.summary.op_type|safe }}{{ op_info.summary.counts|safe }}{{ op_info.summary.total_duration|safe }}
+
+ {% endfor %} +
+
+
\ No newline at end of file diff --git a/profiler/advisor_review/display/html/templates/operator_dispatch.html b/profiler/advisor_review/display/html/templates/operator_dispatch.html new file mode 100644 index 0000000000000000000000000000000000000000..c805086354a41f7f98a803b66b3b666c59393899 --- /dev/null +++ b/profiler/advisor_review/display/html/templates/operator_dispatch.html @@ -0,0 +1,37 @@ +{% if optimizers|length > 0 %} +
+

Operator Dispatch Issues

+
+ + + + + + + {% for optimizer in optimizers %} + + + + + {% endfor %} +
DescriptionSuggestion
{{ optimizer.description |safe }}{{ optimizer.suggestion|safe }}
+ + + + + + + + + {% for issue in issues %} + + + + + + {% endfor %} +
IssueCountsElapsed Time(us)
{{ issue.op_name |safe }}{{ issue.counts |safe }}{{ issue.total_time |safe }}
+
+ +
+{% endif %} \ No newline at end of file diff --git a/profiler/advisor_review/display/html/templates/operator_dynamic_shape.html b/profiler/advisor_review/display/html/templates/operator_dynamic_shape.html new file mode 100644 index 0000000000000000000000000000000000000000..59920b6c9ec276c9edddfd1906a31b41fb106e26 --- /dev/null +++ b/profiler/advisor_review/display/html/templates/operator_dynamic_shape.html @@ -0,0 +1,15 @@ +
+

Operator Dynamic Shape Issues

+
+ + + + + + + + + +
DescriptionSuggestion
{{ format_result.record.optimization_item.description|safe }}{{ format_result.suggestion|safe }}
+
+
\ No newline at end of file diff --git a/profiler/advisor_review/display/html/templates/operator_no_bound.html b/profiler/advisor_review/display/html/templates/operator_no_bound.html new file mode 100644 index 0000000000000000000000000000000000000000..cfbd20baad208216d2d9a1ee856702a163a6abfa --- /dev/null +++ b/profiler/advisor_review/display/html/templates/operator_no_bound.html @@ -0,0 +1,38 @@ +
+

Operator No Bound Issues

+
+ + + + + + + + + + + + + +
DescriptionSuggestionElapsed Time(us)Time Ratio
{{ format_result.record.optimization_item.description|safe }}{{ format_result.suggestion|safe }}{{ format_result.task_duration|safe }}{{ format_result.record.statistics_item.task_duration_ratio|safe }}
+
+ {% for op_type, op_info in format_result.statistic %} +
{{ op_type|safe }}
+
+ + + + + + + + + + + +
Operator TypeCountsElapsed Time(us)
{{ op_info.summary.op_type|safe }}{{ op_info.summary.counts|safe }}{{ op_info.summary.total_duration|safe }}
+
+ {% endfor %} +
+
+
\ No newline at end of file diff --git a/profiler/advisor_review/display/html/templates/overall_analysis.html b/profiler/advisor_review/display/html/templates/overall_analysis.html new file mode 100644 index 0000000000000000000000000000000000000000..ec61ae224ff2da59f2a80a9b4b10117d4c4c7c7a --- /dev/null +++ b/profiler/advisor_review/display/html/templates/overall_analysis.html @@ -0,0 +1,15 @@ +

Model Profiling Time Distribution

+ + + {% for header in headers %} + + {% endfor %} + + {% for row in rows %} + + {% for element in row %} + + {% endfor %} + + {% endfor %} +
{{ header }}
{{ element }}
\ No newline at end of file diff --git a/profiler/advisor_review/display/html/templates/timeline_analysis.html b/profiler/advisor_review/display/html/templates/timeline_analysis.html new file mode 100644 index 0000000000000000000000000000000000000000..b5ea89124277e05e7fdea63a34704df52bb322d4 --- /dev/null +++ b/profiler/advisor_review/display/html/templates/timeline_analysis.html @@ -0,0 +1,34 @@ +
+

{{title|safe}}

+
+
+
+ {% if result.get("img") %} +
+ Image +
+ {% endif %} + + {% if result.get("current") %} + + {% endif %} + + {% if result.get("bottlenect") %} + + {% endif %} + + {% if result.get("advice") %} + + {% endif %} + +
+
+
+
diff --git a/profiler/advisor_review/fusion_operators_api_analysis.ipynb b/profiler/advisor_review/fusion_operators_api_analysis.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..dcc71ba3c139f630c07545340e61c66b1f29d929 --- /dev/null +++ b/profiler/advisor_review/fusion_operators_api_analysis.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../..\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from prettytable import PrettyTable, ALL\n", + "from textwrap import fill\n", + "from profiler.advisor.interface.interface import Interface" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "profiling_path = \"YOUR PROFILING PATH\"\n", + "interface = Interface(profiling_path=profiling_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 融合算子API识别" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "指定profiling路径后,可以自动识别其中包含的融合算子并给出对应的torch_npu api和需要修改的代码堆栈。基于给定堆栈可以快速定位到需要修改的代码段,替换torch_npu api后,能够减少pytorch侧的小算子的下发,进而提升模型训练速度。" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "timeline_fusion_ops_result = interface.get_result(\"schedule\", \"timeline_fusion_ops\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
problemdescriptionsuggestion
timeline_fusion_opsFound 2 apis to be replaced based on the runtime env cann-8.0.0 and torch-2.1.01. Please replace training api according to sub table 'Affinity training api'
" + ], + "text/plain": [ + "+---------------------+---------------------------------------------------------------------------------+-------------------------------------------------------------------------------+\n", + "| problem | description | suggestion |\n", + "+---------------------+---------------------------------------------------------------------------------+-------------------------------------------------------------------------------+\n", + "| timeline_fusion_ops | Found 2 apis to be replaced based on the runtime env cann-8.0.0 and torch-2.1.0 | 1. Please replace training api according to sub table 'Affinity training api' |\n", + "+---------------------+---------------------------------------------------------------------------------+-------------------------------------------------------------------------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_column_num = 3\n", + "problems = timeline_fusion_ops_result.get(\"problems\")\n", + "problem_table = PrettyTable(problems.get(\"headers\")[:display_column_num])\n", + "for row in problems.get(\"data\"):\n", + " for i in range(len(row)):\n", + " row[i] = fill(str(row[i]), width=80)\n", + " problem_table.add_row(row[:display_column_num])\n", + "\n", + "display(problem_table)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "如下所示,存在亲和优化器和梯度裁剪两个可替换的torch_npu api,并给出了具体的堆栈。" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Affinity APICode stacksStack called counts
optimizer.clip_grad_norm_fused_/home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site-
packages/torch/nn/utils/clip_grad.py(49): clip_grad_norm_; /home/ma-
user/work/algorithms/doc_cls/Bert.py(205): train_epoch; /home/ma-
user/work/algorithms/doc_cls/Bert.py(252): <module>
2
torch_npu.optim.NpuFusedAdamW/home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site-
packages/torch_npu/npu/profiler.py(675): __enter__; /home/ma-
user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site-
packages/torch_npu/npu/profiler.py(719): wrapper; /home/ma-
user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site-
packages/torch/optim/lr_scheduler.py(65): wrapper; /home/ma-
user/work/algorithms/doc_cls/Bert.py(219): train_epoch; /home/ma-
user/work/algorithms/doc_cls/Bert.py(252): <module>
2
" + ], + "text/plain": [ + "+---------------------------------+-----------------------------------------------------------------------+---------------------+\n", + "| Affinity API | Code stacks | Stack called counts |\n", + "+---------------------------------+-----------------------------------------------------------------------+---------------------+\n", + "| optimizer.clip_grad_norm_fused_ | /home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- | 2 |\n", + "| | packages/torch/nn/utils/clip_grad.py(49): clip_grad_norm_; /home/ma- | |\n", + "| | user/work/algorithms/doc_cls/Bert.py(205): train_epoch; /home/ma- | |\n", + "| | user/work/algorithms/doc_cls/Bert.py(252): | |\n", + "+---------------------------------+-----------------------------------------------------------------------+---------------------+\n", + "| torch_npu.optim.NpuFusedAdamW | /home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- | 2 |\n", + "| | packages/torch_npu/npu/profiler.py(675): __enter__; /home/ma- | |\n", + "| | user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- | |\n", + "| | packages/torch_npu/npu/profiler.py(719): wrapper; /home/ma- | |\n", + "| | user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- | |\n", + "| | packages/torch/optim/lr_scheduler.py(65): wrapper; /home/ma- | |\n", + "| | user/work/algorithms/doc_cls/Bert.py(219): train_epoch; /home/ma- | |\n", + "| | user/work/algorithms/doc_cls/Bert.py(252): | |\n", + "+---------------------------------+-----------------------------------------------------------------------+---------------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fusion_ops_api = timeline_fusion_ops_result.get(\"timeline_fusion_ops\")\n", + "if fusion_ops_api:\n", + " fusion_ops_api_table = PrettyTable(fusion_ops_api.get(\"headers\"))\n", + "\n", + " for row in fusion_ops_api.get(\"data\"):\n", + " for i in range(len(row)):\n", + " row[i] = fill(str(row[i]), width=80)\n", + " 
fusion_ops_api_table.add_row(row)\n", + "\n", + " fusion_ops_api_table.hrules = ALL\n", + " display(fusion_ops_api_table)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/profiler/advisor_review/img/advisor_result.PNG b/profiler/advisor_review/img/advisor_result.PNG new file mode 100644 index 0000000000000000000000000000000000000000..a9652f4ca53ff142a5ebd1033075aad54f8f0297 Binary files /dev/null and b/profiler/advisor_review/img/advisor_result.PNG differ diff --git a/profiler/advisor_review/img/jupyter_report.PNG b/profiler/advisor_review/img/jupyter_report.PNG new file mode 100644 index 0000000000000000000000000000000000000000..baa860a7893e1801337916aea37475ea69bbaf04 Binary files /dev/null and b/profiler/advisor_review/img/jupyter_report.PNG differ diff --git a/profiler/advisor_review/interface/__init__.py b/profiler/advisor_review/interface/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor_review/interface/interface.py b/profiler/advisor_review/interface/interface.py new file mode 100644 index 0000000000000000000000000000000000000000..231f595d70b7e9dd6ee436153dc24259cfef640b --- /dev/null +++ b/profiler/advisor_review/interface/interface.py @@ -0,0 +1,75 @@ +import os +from collections import OrderedDict +import sys +sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), "cluster_analyse")) +sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), 
"compare_tools")) + +from profiler.advisor.utils.utils import Timer +from profiler.advisor.analyzer.computation.profiling_analyzer import AicpuAnalyzer, BlockDimAnalyzer, DynamicShapeAnalyzer, OperatorBoundAnalyzer +from profiler.advisor.analyzer.schedule.fusion_ops.fusion_ops_analyzer import TimelineFusionOpsAnalyzer +from profiler.advisor.analyzer.graph_fusion.graph_fusion_analyzer import FusionOPAnalyzer +from profiler.advisor.common.analyzer_scopes import SupportedScopes +from profiler.advisor.analyzer.cluster.slow_rank_analyser import SlowRankAnalyzer +from profiler.advisor.analyzer.cluster.slow_link_analyser import SlowLinkAnalyzer +from profiler.advisor.analyzer.overall.overall_summary_analyzer import OverallSummaryAnalyzer +from profiler.advisor.analyzer.schedule.dispatch.timeline_op_dispatch_analyzer import OpDispatchAnalyzer + +class Interface: + supported_analyzer = { + "schedule": OrderedDict({ + SupportedScopes.TIMELINE_FUSION_OPS: TimelineFusionOpsAnalyzer + }), + "computation": OrderedDict({ + SupportedScopes.DYNAMIC_SHAPE_ANALYSIS: DynamicShapeAnalyzer, + SupportedScopes.AICPU_ANALYSIS: AicpuAnalyzer, + SupportedScopes.OPERATOR_NO_BOUND_ANALYSIS: OperatorBoundAnalyzer, + SupportedScopes.BLOCK_DIM_ANALYSIS: BlockDimAnalyzer, + SupportedScopes.GRAPH: FusionOPAnalyzer, + SupportedScopes.TIMELINE_OP_DISPATCH: OpDispatchAnalyzer + }), + "communication": OrderedDict(), + "overall": OrderedDict({SupportedScopes.OVER_ALL: OverallSummaryAnalyzer}), + "dataloader": OrderedDict(), + "cluster": OrderedDict({ + SupportedScopes.SLOW_RANK: SlowRankAnalyzer, + SupportedScopes.SLOW_LINK: SlowLinkAnalyzer + }) + } + + all_dimension = list(supported_analyzer.keys()) + + def __init__(self, **kwargs): + self.collection_path = os.path.realpath(kwargs.get("profiling_path")) + + @staticmethod + def get_scope(dimension): + return list(Interface.supported_analyzer.get(dimension).keys()) + + @staticmethod + def get_analyzer(dimension, scope): + return 
class OptimizeItem:
    """One detected optimization problem: a name, a description and suggestions.

    Attributes:
        problem: short problem identifier shown in the result table.
        description: human-readable description of the issue.
        suggestion: list of suggestion strings (joined and numbered on output).
    """

    def __init__(self, problem, description, suggestion):
        self.problem = problem
        self.description = description
        self.suggestion = suggestion

    @property
    def data(self):
        """Return [problem, description, numbered-suggestion-string] for one table row."""
        numbered = [f"{index}. {suggestion}"
                    for index, suggestion in enumerate(self.suggestion, start=1)]
        return [self.problem, self.description, "\n".join(numbered)]

    @property
    def headers(self):
        """Column headers matching :attr:`data`."""
        return ["problem", "description", "suggestion"]


class StatisticsItem:
    """Aggregated timing statistics for one problem.

    Durations may be numbers (microseconds) or the empty string "" for the
    placeholder record used when no statistics are available
    (see :class:`OptimizeRecord`).
    """

    def __init__(self, total_task_duration, task_duration, count, income=None):
        self.total_task_duration = total_task_duration
        self.task_duration = task_duration
        self.count = count
        self.income = income
        # The ratio is only meaningful when both durations are numeric; the
        # placeholder record passes "" and must not attempt the division.
        if isinstance(task_duration, str) or isinstance(total_task_duration, str):
            self.task_duration_ratio = ""
        elif total_task_duration != 0:
            self.task_duration_ratio = round(task_duration / total_task_duration, 4)
        else:
            self.task_duration_ratio = 0

    @property
    def data(self):
        """Return [count, total_time, time ratio, income, income ratio] for one table row."""

        def _cal_ratio(divisor, dividend):
            # Guard against the placeholder record where total_task_duration
            # is ""; previously `"" != 0` passed the check and the division
            # raised TypeError whenever income was set without real durations.
            if divisor and isinstance(dividend, (int, float)) and dividend != 0:
                return divisor, round(divisor / dividend, 4)
            return "", ""

        income, income_ratio = _cal_ratio(self.income, self.total_task_duration)
        return [self.count, self.total_task_duration, self.task_duration_ratio, income, income_ratio]

    @property
    def headers(self):
        """Column headers matching :attr:`data`."""
        return ["problem count", "total_time(us)", "time ratio", "income(us)", "income ratio"]


class OptimizeRecord:
    """Pairs an :class:`OptimizeItem` with its (optional) :class:`StatisticsItem`."""

    def __init__(self, optimization_item, statistics_item=None) -> None:
        self.optimization_item = optimization_item
        # Fall back to an all-empty placeholder so data/headers stay aligned.
        self.statistics_item = statistics_item or StatisticsItem("", "", "")

    @property
    def data(self):
        """Concatenated row: optimization columns followed by statistics columns."""
        return self.optimization_item.data + self.statistics_item.data

    @property
    def headers(self):
        """Concatenated headers matching :attr:`data`."""
        return self.optimization_item.headers + self.statistics_item.headers
class ResultWriter:
    """Writes analysis results into an xlsx workbook, one worksheet per sheet name."""

    def __init__(self, result_path=None):
        self.result_path = result_path
        self.workbook = xlsxwriter.Workbook(result_path)

        self.header_format = None
        self.data_cell_format = None
        self._init_header_format()
        self._init_data_cell_format()

    def _init_header_format(self):
        # White bold text on a blue background, used for the first row only.
        style = {
            "bold": True,
            "color": "#FFFFFF",
            "bg_color": "#187498",
            "align": "center",
            "border": 1,
            "font_name": "Arial",
        }
        self.header_format = self.workbook.add_format(style)

    def _init_data_cell_format(self):
        # Wrapped, top-aligned cells for data rows.
        style = {
            "bold": False,
            "align": "left",
            "valign": "top",
            "border": 1,
            "font_name": "Arial",
            'text_wrap': True,
        }
        self.data_cell_format = self.workbook.add_format(style)

    def add_data(self, sheet_name, headers, data_list):
        """Create worksheet `sheet_name` and fill an optional header row plus data rows."""
        sheet = self.workbook.add_worksheet(sheet_name)

        if headers:
            for col_index, title in enumerate(headers):
                sheet.write(0, col_index, title, self.header_format)

        if data_list:
            # Data rows start at row 1, directly below the header row.
            for row_index, record in enumerate(data_list, start=1):
                for col_index, value in enumerate(record):
                    sheet.write(row_index, col_index, value, self.data_cell_format)

        sheet.autofit()

    def save(self):
        """Close the workbook, flushing it to disk; log instead of raising on failure."""
        try:
            self.workbook.close()
        except Exception as e:
            logger.error("Failed to save analysis results, reason is %s", e)
not isinstance(self._sheet_data[sheet_name].get("data"), list): + self._sheet_data[sheet_name]["data"] = [] + if data not in self._sheet_data[sheet_name]["data"]: + self._sheet_data[sheet_name]["data"].append(data) + + +@singleton +class OptimizeResult: + + def __init__(self): + self.result_writer = ResultWriter(Config().analysis_result_file) + self.sheet_recorder = SheetRecoder() + self.page_dict = False + self._tune_op_list = [] + + @property + def data(self): + return self.sheet_recorder.sheet_data + + def add_tune_op_list(self, tune_op_list) -> None: + """ + add tune op name to tune op list + :param tune_op_list: tune op name list to be added + :return: None + """ + for op_name in tune_op_list: + if op_name not in self._tune_op_list: + self._tune_op_list.append(op_name) + + def add(self, overview_item): + sheet_name = "problems" + + headers = overview_item.headers + data = overview_item.data + self.sheet_recorder.add_headers(sheet_name, headers) + self.sheet_recorder.add_data(sheet_name, data) + + TerminalResult().add(overview_item.optimization_item.data) + self.page_dict = True + + def add_detail(self, sheet_name, headers=None, detail=None): + if headers: + self.sheet_recorder.add_headers(sheet_name, headers) + if detail: + self.sheet_recorder.add_data(sheet_name, detail) + self.page_dict = True + + def show(self): + for sheet_name, sheet_data in self.sheet_recorder.sheet_data.items(): + self.result_writer.add_data(sheet_name, sheet_data.get("headers"), sheet_data.get("data")) + + terminal_result = TerminalResult() + terminal_result.print() + if not terminal_result.result_list: + Config().remove_log() + return + self.result_writer.save() + logger.info("Save problems details file to %s", Config().analysis_result_file) + self._save_op_file_list() + + def _save_op_file_list(self) -> None: + if not self._tune_op_list: + return + tune_op_dict = {"tune_ops_name": self._tune_op_list} + tune_ops_file = Config().tune_ops_file + try: + + with 
@singleton
class TerminalResult:
    """
    Result output to screen
    """

    def __init__(self):
        columns = ["No.", "Problem", "Description", "Suggestion"]
        self.width, _ = self.get_terminal_size()
        # Cap the table width to the terminal when its size is known.
        if self.width is None:
            self.table = PrettyTable(columns)
        else:
            self.table = PrettyTable(columns, max_table_width=max(self.width - 20, 180))
        self.table.hrules = ALL
        self.result_list = []

    @staticmethod
    def get_terminal_size():
        """Return (width, height) of the terminal, or (None, None) when unavailable."""
        try:
            size = os.get_terminal_size()
        except OSError:
            return None, None
        return size.columns, size.lines

    def add(self, result_str):
        """
        add a result str
        """
        self.result_list.append(result_str)

    def print(self):
        """
        print screen result with format table
        """
        # Number the queued rows from 1 and render them left-aligned.
        for row_no, result in enumerate(self.result_list, start=1):
            self.table.add_row([row_no] + result)
        self.table.align = "l"

        if self.result_list:
            click.echo(self.table)
        else:
            click.echo(click.style(const.SKIP_ANALYZE_PROMPT, fg='red'))
convert to {} if possible." +AICPU_DOC_URL: &AICPU_DOC_URL "https://support.huaweicloud.com/bestpractice-modelarts/modelarts_10_2517.html" + +CommonChecker: + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ __ALL__ ] + ignore_type: [ cast, tensorequal, equal, nonzero, mul ] + input: [ float, float32, float16, bool, int32, uint32, int64, uint64, int8, uint8, int16, uint16, dt_bf16 ] + output: [ float, float32, float16, bool, int32, uint32, int64, uint64, int8, uint8, int16, uint16, dt_bf16 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ cast ] + input: [ float, float32, float16, bool, int32, uint32, int64, uint64, uint8, dt_bf16 ] + output: [ float, float32, float16, bool, int32, uint32, int64, uint64, uint8, dt_bf16 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ tensorequal ] + input: [ float, float32, float16, bool, int32, int8, uint8 ] + output: [ bool ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ equal ] + input: [ float, float32, float16, bool, int32, int64, int8, uint8 ] + output: [ bool ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ nonzero ] + input: [ float16, bool, dt_bf16 ] + output: [ int64 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ mul ] + input: [ float, float32, float16, bool, int32, uint32, int64, uint64, int8, uint8, dt_bf16 ] + output: [ float, float32, float16, bool, int32, uint32, int64, uint64, int8, uint8, dt_bf16 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.0, 7.0.0] + op_type: [ __ALL__ ] + ignore_type: [ cast, tensorequal, equal, nonzero, mul ] + input: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8, int16, complex64, complex128 ] + output: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, 
uint8, int16, complex64, complex128 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.0, 7.0.0] + op_type: [ cast ] + input: [ float, float32, float16, bool, int32, uint32, int64, uint64, uint8, dt_bf16 ] + output: [ float, float32, float16, bool, int32, uint32, int64, uint64, uint8, dt_bf16 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.0, 7.0.0] + op_type: [ tensorequal ] + input: [ float, float32, float16, dt_bf16, float64, bool, int32, int8, uint8 ] + output: [ bool ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.0, 7.0.0] + op_type: [ equal ] + input: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8 ] + output: [ bool ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.0, 7.0.0] + op_type: [ mul ] + input: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8, complex64 ] + output: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8, complex64 ] + suggestion: *DataTypeSuggeation + +ExampleGuideChecker: + - IndexPutChecker: + op_type: [index] + url: *AICPU_DOC_URL + suggestion: 'Please modify source code followed by this LINK, try to replace index operator with equivalent operator.' + + - NonzeroChecker: + op_type: [ indexput, indexputv2 ] + url: *AICPU_DOC_URL + suggestion: 'Please modify source code followed by this LINK, try to replace indexput operator with equivalent operator.' + + - CastChecker: + op_type: [ argmin ] + url: *AICPU_DOC_URL + suggestion: 'Please update your cann-tookit to at least 7.0.RC1 version by this LINK.' + + - CastChecker: + op_type: [ nonzero ] + url: *AICPU_DOC_URL + suggestion: 'Please modify source code followed by this LINK, try to replace nonzero operator with equivalent operator.' 
\ No newline at end of file diff --git a/profiler/advisor_review/rules/op_fusion_pass.yaml b/profiler/advisor_review/rules/op_fusion_pass.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3ff69a578285ba15d075f2acbb852499d56021a2 --- /dev/null +++ b/profiler/advisor_review/rules/op_fusion_pass.yaml @@ -0,0 +1,491 @@ +Elementwise: &Elementwise [ Relu, Pow, Add, Sub, Mul, Div, Abs, Ceil, Log, Sqrt, Exp, LeakyRelu ] + +GraphFusion: + - FlashAttentionFusionPass: + version: 1 + nodes: + - node_1: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + - node_2: [ Mul ] + - node_3: [ Softmax, SoftmaxV2 ] + - node_4: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + - [ node_3, node_4 ] + + - FlashAttentionFusionPass_V2: + version: 1 + nodes: + - node_1: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + - node_2: [ Mul ] + - node_3: [ TransData ] + - node_4: [ Softmax, SoftmaxV2 ] + - node_5: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + - [ node_3, node_4 ] + - [ node_4, node_5 ] + + - BMMStridedSliceDGeluFusionPass: + version: 1 + nodes: + - node_1: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + - node_2: [StridedSliceD] + - node_3: [Relu] + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + + - BMMConfusionTransposeDFusionPass: + version: 1 + nodes: + - node_1: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + - node_2: [ ConfusionTransposeD ] + - node_3: [ Relu ] + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + + - BMMConfusionTransposeDFusionPass_V2: + version: 1 + nodes: + - node_1: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + - node_2: [ ConfusionTransposeD ] + edges: + - [ node_1, node_2 ] + + - Conv2DAddGroupNormFusionPass: + version: 0 + struct: [ Conv2D, Add, GroupNorm ] + + - RMSnormAddFusionPass: + version: 0 + struct: [ RMSnorm, Add ] + + - ConvToFullyConnectionFusionPass: + version: 0 + struct: [ 
Conv ] + + - ZConcatv2dFusionPass: + version: 0 + struct: [ ConcatV2d, ConcatV2d ] + + - BatchMatMulReduceMeanFusionPass: + version: 1 + nodes: + - node_1: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + - node_2: [ Add ] + - node_3: [ Relu ] + - node_4: [ ReduceMean ] + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + - [ node_3, node_4 ] + + - PadDepthwiseConv2dFusionPass: + version: 0 + struct: [ PadD, DepthwiseConv2D ] + + - ConvBatchnormFusionPass: + version: 1 + nodes: + - node_1: [ Conv2d, Conv3d, DepthwiseConv2d ] + - node_2: [ Batchnorm ] + + edges: + - [ node_1, node_2 ] + + - AConv2dMulFusion: + version: 1 + nodes: + - node_1: [ Conv2d, Conv3d ] + - node_2: [ Mul ] + + edges: + - [ node_1, node_2 ] + + - TBEConvAddFusion: + version: 1 + nodes: + - node_1: [ Conv2d, Conv3d ] + - node_2: [ Add ] + + edges: + - [ node_1, node_2 ] + + - ZBNupdateReluV2Conv2DBNreducePass: + version: 0 + struct: [ BNTrainingUpdate, ReluV2, Conv2D, BNTrainingReduce ] + + - ASplitConv2dConcatPass: + version: 1 + nodes: + - node_1: [ MatMul, MatMulV2, BatchMatMul, BatchMatMulV2 ] + - node_2: [ Cast ] + + edges: + - [ node_1, node_2 ] + + - MatMulBiasAddFusionPass: + version: 1 + nodes: + - node_1: [ MatMul, MatMulV2, BatchMatMul, BatchMatMulV2 ] + - node_2: [ BiasAdd, Add ] + + edges: + - [ node_1, node_2 ] + + - Conv2DbpInputBiasAddFusionPass: + version: 0 + struct: [ Conv2DBackpropInput, BiasAdd ] + + - BatchMatmulV2ReduceFusionPass: + version: 0 + struct: [ BatchMatMulV2, ReduceSumD ] + + - BatchMatmulV2ReduceFusionPass_V2: + version: 0 + struct: [ BatchMatMulV2, Cast, ReduceSumD ] + + - Conv3DbpInputBiasAddFusionPass: + version: 0 + struct: [ Conv3DBackpropInputD, BiasAdd ] + + - AFullyConnectionReshapePass: + version: 0 + struct: [ FullyConnection, Reshape ] + + - GemmTransFusionPass: + version: 0 + struct: [ Transpose, Gemm ] + + - Resnet50DbnDwFusionPass: + version: 0 + struct: [ BNTrainingReduceGrad, Conv2DBackpropFilterD ] + + - CastReluCastFusionPass: + 
version: 0 + struct: [ Cast, Relu, Cast ] + + - PadConv2dFusionPass: + version: 1 + nodes: + - node_1: [ PadD, PadDV3 ] + - node_2: [ Conv2D ] + + edges: + - [ node_1, node_2 ] + + - Conv2DTransposeBatchnormFusionPass: + version: 1 + nodes: + - node_1: [ Conv2dTranspose ] + - node_2: [ BatchNorm, BNInference ] + + edges: + - [ node_1, node_2 ] + + - AvgPoolV2GradFusionPass: + version: 0 + struct: [ AvgPooV2lGrad ] + + - DropOutDoMaskFusionPass: + version: 0 + struct: [ DropOutDoMaskV3D ] + + - ConvCastFusionPass: + version: 0 + struct: [ Conv2D, Cast ] + + - ConvCastFusionPass_V2: + version: 0 + struct: [ Conv2D, TransData, Cast ] + + - StridedSliceConcatFusionPass: + version: 1 + nodes: + - node_1: [ StridedSliceD ] + - node_2: [ StridedSliceD ] + - node_3: [ ConcatD ] + + edges: + - [ node_1, node_3 ] + - [ node_2, node_3 ] + + - ConvCastFusionPass: + version: 0 + struct: [ SplitV ] + + - AInplaceAddFusionPass: + version: 0 + struct: [ InplaceAdd ] + + - AInplaceSubFusionPass: + version: 0 + struct: [ InplaceSub ] + + - AInplaceUpdateFusionPass: + version: 0 + struct: [ InplaceUpdate ] + +UBFusion: + - TbeConv3dElemwisePass: + version: 1 + nodes: + - node_1: [ Conv3D ] + - node_2: *Elementwise + edges: + - [ node_1, node_2 ] + + - TbeConv3dDxElemwisePass: + version: 0 + struct: [ Conv3dBackpropInput, AddN, LeakyReluGrad ] + + - TbeConv3dDxElemwisePass_V2: + version: 0 + struct: [ Conv3dBackpropInput, LeakyReluGrad ] + + - MatMulDropoutDoMaskV3dFusionPass: + version: 0 + struct: [ MatMul, Dropout_do_mask_v3_d, Add ] + + - BatchMatMulDropoutDoMaskV3dFusionPass: + version: 0 + struct: [ BatchMatMul, Dropout_do_mask_v3_d, Add ] + + - MatmulReduceSumUbFusion: + version: 0 + struct: [ BatchMatMul, ReduceSum ] + + - TbeBatchMatMulElementWiseFusionPass: + version: 1 + nodes: + - node_1: [ BatchMatMul, GEMM ] + - node_2: *Elementwise + + edges: + - [ node_1, node_2 ] + + - ATbeMatMulElemwiseFusionPass: + version: 1 + nodes: + - node_1: [ MatMul, GEMM ] + - node_2: 
*Elementwise + + edges: + - [ node_1, node_2 ] + + - MatmulConfusiontransposeUbFusion: + version: 0 + struct: [ MatMul, matmul_transpose ] + + - TbeFullyconnectionElemwiseDequantFusionPass: + version: 1 + nodes: + - node_1: [ BatchMatMul, MatMul, FullyConnection ] + - node_2: *Elementwise + + edges: + - [ node_1, node_2 ] + + - BatchMatmulConfusiontransposeUbFusion: + version: 0 + struct: [ BatchMatMul, batchmatmul_transpose ] + + - TbeConvSigmoidMulQuantFusionPass: + version: 1 + nodes: + - node_1: [ Conv ] + - node_2: [ Sigmoid ] + - node_3: [ Mul ] + - node_4: [ Quant ] + + edges: + - [ node_1, node_2 ] + - [ node_1, node_3 ] + - [ node_2, node_3 ] + - [ node_3, node_4 ] + + - TbeConv2DReluv2Pass: + version: 0 + struct: [ Conv2D, ReluV2 ] + + - TbeConvDoubleInFusionPass: + version: 1 + nodes: + - node_1: [ Conv2D ] + - node_2: *Elementwise + - node_3: *Elementwise + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + + - TbeConv2dAddClipMulDivFusionPass: + version: 0 + struct: [ Conv2D, Add, Clip, Mul, Div ] + + - TbeConv2dAddClipMulDivFusionPass_V2: + version: 0 + struct: [ Conv2D, Add, Clip, Mul ] + + - TbeConv2dAddRelu6MulMulFusionPass: + version: 1 + nodes: + - node_1: [ Conv2D, DepthwiseConv2D ] + - node_2: [ Add ] + - node_3: [ Relu6 ] + - node_4: [ Mul ] + - node_5: [ Mul ] + + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + - [ node_3, node_4 ] + - [ node_4, node_5 ] + + - ConvClipByValueFusionPass: + version: 1 + nodes: + - node_1: [ Conv2D ] + - node_2: *Elementwise + edges: + - [ node_1, node_2 ] + + - TbeAippConvReluMaxpoolingFusion: + version: 1 + nodes: + - node_1: [ Conv2D ] + - node_2: *Elementwise + - node_3: [ MaxPool, MaxPoolv3 ] + + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + + - TbeReduceElemwiseFusionPass: + version: 1 + nodes: + - node_1: *Elementwise + - node_2: [ CommReduce ] + edges: + - [ node_1, node_2 ] + + - TbeReadSelectEltwiseFusionPass: + version: 1 + nodes: + - node_1: [ ReadSelect ] + - node_2: *Elementwise 
+ + edges: + - [ node_1, node_2 ] + + - TbeEltwiseWriteSelectFusionPass: + version: 1 + nodes: + - node_1: *Elementwise + - node_2: [ write_select ] + + edges: + - [ node_1, node_2 ] + + - TbeEltwiseFusionPass: + version: 1 + nodes: + - node_1: *Elementwise + - node_2: *Elementwise + + edges: + - [ node_1, node_2 ] + + - TbeConvBnreduceFusionPass: + version: 0 + struct: [ Convolution, bn_reduce ] + + - TbeBnupdateEltwiseFusionPass: + version: 1 + nodes: + - node_1: [ bn_update ] + - node_2: *Elementwise + edges: + - [ node_1, node_2 ] + + - TbeConv2DBackpropElemwiseFusionPass: + version: 1 + nodes: + - node_1: [ Conv2DBackpropInputD, Conv2DTransposeD, Deconvolution ] + - node_2: [ Add, ReluGradV2 ] + + edges: + - [ node_1, node_2 ] + + - TbeDxElemwisePass: + version: 1 + nodes: + - node_1: [ Conv2DBackpropInputD, Conv2DTransposeD, Deconvolution ] + - node_2: [ LeakyRelu, Prelu ] + + edges: + - [ node_1, node_2 ] + + - TbeConv2dBackpropRequantFusionPass: + version: 1 + nodes: + - node_1: [ Conv2DBackpropInputD, Conv2DTransposeD, Deconvolution ] + - node_2: [ AscendRequant ] + + edges: + - [ node_1, node_2 ] + + - TbeDwTransdataFusionPass: + version: 1 + nodes: + - node_1: [ Transdate ] + - node_2: [ Transdate ] + - node_3: [ Conv2DBackpropFilter ] + + edges: + - [ node_1, node_3 ] + - [ node_2, node_3 ] + + - TbeDxTransdataFusionPass: + version: 1 + nodes: + - node_1: [ Transdate ] + - node_2: [ Transdate ] + - node_3: [ Conv2DBackpropInput ] + + edges: + - [ node_1, node_3 ] + - [ node_2, node_3 ] + + - TbeEltwiseCastFusionPass: + version: 1 + nodes: + - node_1: [ Relu, Add, Mul, Sqrt ] + - node_2: [ Cast ] + + edges: + - [ node_1, node_2 ] + + - TbeEltwiseCastFusionPass_V2: + version: 1 + nodes: + - node_1: [ Cast ] + - node_2: [ Relu, Add, Mul, Sqrt ] + + + edges: + - [ node_1, node_2 ] + + - TbeConv2DBackpropDequantFusionPass: + version: 1 + nodes: + - node_1: [ Conv2DBackpropInputD, Conv2DTransposeD, Deconvolution ] + - node_2: [ AscendDequant ] + + + edges: + 
- [ node_1, node_2 ] diff --git a/profiler/advisor_review/rules/timeline_fusion_ops.yaml b/profiler/advisor_review/rules/timeline_fusion_ops.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10c12ff18dd8792e24a89c6d5fbb7ed87f643a9d --- /dev/null +++ b/profiler/advisor_review/rules/timeline_fusion_ops.yaml @@ -0,0 +1,59 @@ +- cann_version: 6.3.RC2 + torch_version: 1.11.0 + unique_id: 0 + operator_rules: + aten: + add: + torch_npu.npu_confusion_transpose: ["(permute|transpose)-(contiguous){0,1}-(reshape|view)", + "(reshape|view)-(contiguous){0,1}-(permute|transpose)"] + torch_npu.fast_gelu: [gelu] + torch_npu.npu_linear: [linear] + torch_npu.npu_mish: [mish] + torch_npu.contrib.module.Mish: [mish] + torch_npu.npu_scaled_masked_softmax: [ "softmax-(mul){0,1}-(masked_fill_|add)" ] + torch_npu.npu_silu: [ silu, mul-sigmoid, sigmoid-mul ] + torch_npu.contrib.module.SiLU: [ silu, mul-sigmoid, sigmoid-mul ] + optimizer.clip_grad_norm_fused_: [add-reciprocal-mul] + Optimizer: + add: + torch_npu.optim.NpuFusedAdamW: [AdamW.step] + torch_npu.optim.NpuFusedSGD: [SGD.step] + torch_npu.optim.NpuFusedAdadelta: [Adadelta.step] + torch_npu.optim.NpuFusedLamb: [Lamb.step] + torch_npu.optim.NpuFusedAdamP: [AdamP.step] + torch_npu.optim.NpuFusedBertAdam: [BertAdam.step] + torch_npu.optim.NpuFusedRMSprop: [RMSprop.step] + torch_npu.optim.NpuFusedRMSpropTF: [RMSpropTF.step] + torch_npu.optim.NpuFusedAdam: [Adam.step] + + +- cann_version: 7.0.RC1 + torch_version: [1.11.0,2.1.0] + unique_id: 1 + inherit_unique_id: 0 + operator_rules: + aten: + add: + torch_npu.npu_fusion_attention: ["matmul-(add){0,1}-(mul){0,1}-(masked_fill_|add){0,1}-softmax-(dropout){0,1}-matmul"] + torch_npu.npu_rotary_mul: ["(chunk|slice)-neg-cat-(mul){0,2}-add"] + +- cann_version: 7.0.0 + torch_version: [1.11.0, 2.1.0] + unique_id: 2 + inherit_unique_id: 1 + operator_rules: + aten: + add: + torch_npu.npu_rms_norm: ["(pow){0,1}-(mean){0,1}-(add){0,1}-rsqrt-mul-(type_as){0,1}"] + 
def get_log_level():
    """Return the advisor log level taken from the environment, validated.

    Reads the const.ADVISOR_LOG_LEVEL environment variable (falling back to
    const.DEFAULT_LOG_LEVEL), upper-cases it, and checks the name exists on
    the ``logging`` module before returning it.

    Raises:
        AttributeError: when the configured name is not a known logging level.
    """
    level_name = os.getenv(const.ADVISOR_LOG_LEVEL, const.DEFAULT_LOG_LEVEL).upper()
    if hasattr(logging, level_name):
        return level_name
    raise AttributeError(f"module 'logging' has no attribute '{level_name}', "
                         f"supported log level: {', '.join(const.SUPPORTED_LOG_LEVEL)}")
logging.Formatter(fmt="[%(asctime)s][%(levelname)s][%(filename)s L%(lineno)s] %(message)s", + datefmt='%Y-%m-%d,%H:%M:%S') + else: + formatter = logging.Formatter(fmt="[%(asctime)s][%(levelname)s] %(message)s", + datefmt='%Y-%m-%d,%H:%M:%S') + console_handle.setFormatter(formatter) + + # add log level out + logging.addLevelName(60, 'OUT') + logger = logging.getLogger() + setattr(logger, 'out', lambda *args: logger.log(60, *args)) + output_handle = logging.StreamHandler() + output_handle.setLevel("OUT") + formatter = logging.Formatter("%(message)s") + output_handle.setFormatter(formatter) + + logger.setLevel("DEBUG") + logger.handlers = [] + if not logger.handlers: + logger.addHandler(console_handle) + logger.addHandler(output_handle) + else: + logger.info(logger.handlers) + logger.debug("The logger of analysis have initialized successfully.") + return logger diff --git a/profiler/advisor_review/utils/tools.py b/profiler/advisor_review/utils/tools.py new file mode 100644 index 0000000000000000000000000000000000000000..2cbcb5e0521d4a947fb8ff6af40e98c32dedab23 --- /dev/null +++ b/profiler/advisor_review/utils/tools.py @@ -0,0 +1,76 @@ +from functools import partial + +import click + +CONTEXT_SETTINGS = dict(help_option_names=['-H', '-h', '--help']) + + +class ClickAliasedGroup(click.Group): + """ + Alias click command + """ + FORMAT_LIMIT_LEN = 6 + + def __init__(self, *args, **kwargs): + super(ClickAliasedGroup, self).__init__(*args, **kwargs) + self._alias_dict = {} + self._commands = {} + + def command(self, *args, **kwargs): + alias = kwargs.pop('alias', None) + decorator = super(ClickAliasedGroup, self).command(*args, **kwargs) + if not alias: + return decorator + + return partial(self._decorator_warpper, decorator, alias) + + def group(self, *args, **kwargs): + alias = kwargs.pop('alias', None) + decorator = super(ClickAliasedGroup, self).group(*args, **kwargs) + if not alias: + return decorator + + return partial(self._decorator_warpper, decorator, alias) + + 
def _decorator_warpper(self, decorator, alias, func=None): + cmd = decorator(func) + self._commands[cmd.name] = alias + self._alias_dict[alias] = cmd.name + return cmd + + def resolve_alias(self, cmd_name): + if cmd_name in self._alias_dict.keys(): + return self._alias_dict[cmd_name] + return cmd_name + + def get_command(self, ctx, cmd_name): + cmd_name = self.resolve_alias(cmd_name) + command = super(ClickAliasedGroup, self).get_command(ctx, cmd_name) + return command if command else None + + def format_commands(self, ctx, formatter): + rows = [] + sub_commands = self.list_commands(ctx) + max_len = 0 + if len(sub_commands) > 0: + max_len = max(len(cmd) for cmd in sub_commands) + + limit = formatter.width - self.FORMAT_LIMIT_LEN - max_len + for sub_command in sub_commands: + cmd = self.get_command(ctx, sub_command) + if cmd is None: + continue + if hasattr(cmd, 'hidden') and cmd.hidden: + continue + if sub_command in self._commands: + alias = self._commands[sub_command] + sub_command = f'{sub_command}, {alias}' + if click.__version__[0] < '7': + cmd_help = cmd.short_help or '' + else: + cmd_help = cmd.get_short_help_str(limit) + rows.append((sub_command, cmd_help)) + + if rows: + with formatter.section('Commands'): + formatter.write_dl(rows) diff --git a/profiler/advisor_review/utils/utils.py b/profiler/advisor_review/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..84419b67087f8a434361f77479899d10ef91b9f5 --- /dev/null +++ b/profiler/advisor_review/utils/utils.py @@ -0,0 +1,552 @@ +import inspect +import json +import logging +import multiprocessing as mp +import os +import queue +import re +import stat +import time +import traceback +import types +from functools import wraps +from typing import Any, Set + +import click +import requests +from requests.adapters import HTTPAdapter +from tqdm import tqdm + +from profiler.advisor.common import constant as const +from profiler.advisor.common.version_control import VersionControl +from 
def get_class_absolute_path(cls):
    """Return the dotted ``module.ClassName`` path of *cls*.

    Returns None when :func:`inspect.getmodule` cannot determine the class's
    defining module (e.g. dynamically created classes).
    """
    module = inspect.getmodule(cls)
    if module is None:
        return None
    return f"{module.__name__}.{cls.__name__}"
def lazy_property(func):
    """
    Cache a computed attribute lazily.

    The wrapped function runs once on first access; the result is stored on
    the instance under ``_lazy_<name>`` and returned directly on every later
    access, so the computation is never repeated.
    """
    cache_name = "_lazy_" + func.__name__

    @property
    def cached(instance):
        # Compute and stash the value only on the very first access.
        if not hasattr(instance, cache_name):
            setattr(instance, cache_name, func(instance))
        return getattr(instance, cache_name)

    return cached
def load_parameter(parameter, default):
    """Return the value of environment variable *parameter*.

    Falls back to *default* when the variable is unset or set to an empty
    string. The environment is read exactly once, fixing the original's
    double ``os.environ.get`` lookup (two reads of mutable global state).
    """
    value = os.environ.get(parameter)
    return value if value else default
def safe_division(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is zero.

    The original ``denominator and numerator / denominator`` idiom returned
    the falsy denominator object itself (e.g. ``0.0``) rather than the
    documented 0; an explicit guard matches the contract and reads clearly.
    """
    if not denominator:
        return 0
    return numerator / denominator
def format_excel_title(title: str) -> str:
    """
    format excel title

    Lower-cases the title, strips the unit suffixes ``(us)``/``(ns)``/``(%)``,
    and converts spaces to underscores so the result is a safe column key.
    """
    formatted = title.lower()
    for unit in ("(us)", "(ns)", "(%)"):
        formatted = formatted.replace(unit, '')
    return formatted.replace(" ", "_")
""" + safe open to check file + """ + + # pylint: disable=consider-using-with + def __init__(self, name, mode='r', encoding=None): + self.file = None + if not os.path.exists(name): + logger.warning("%s not exist, please check", name) + return + + if os.access(name, os.R_OK): + self.file = open(name, mode, encoding=encoding, errors="ignore") + else: + logger.warning("%s can not read, check the permissions", name) + + def __enter__(self): + return self.file + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.file: + self.file.close() + return True + + +def save_downloaded_file(response, url_path, file_save_path): + """保存响应体中的文件 + + 参数: + response: 请求后获取的响应体 + url_path: url路径 + file_save_path: 保存路径 + 返回: + final_file_path: 文件保存绝对路径 + """ + # 获取url路径中的文件名, 拼接在保存路径下 + file_save_path = os.path.normpath(file_save_path) + file_name = os.path.basename(url_path) + final_file_path = os.path.join(file_save_path, file_name) + # 若目标保存路径不存在,则自动生成 + if not os.path.exists(file_save_path): + os.makedirs(file_save_path) + if response.status_code <= 300: + logger.debug("Response status code is %s", response.status_code) + flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL + modes = stat.S_IWUSR | stat.S_IRUSR + # 若文件已存在,则移除已有的文件并保存最新的文件 + if os.path.exists(final_file_path): + os.remove(final_file_path) + # 保存文件 + with os.fdopen(os.open(final_file_path, flags, modes), mode="wb") as f: + f.write(response.content) + logger.info("Success to save content in: %s", os.path.abspath(final_file_path)) + else: + # 若响应码不为预期的数值, 显示相应告警 + logger.warning("Failed to save the response body. The response status code is %s. 
def read_csv(file):
    """Parse *file* as CSV and return its rows as a list of lists.

    Returns [] when the file cannot be read. Previously, a missing file made
    SafeOpen yield None and ``csv.reader(None)`` raised TypeError, which the
    buggy context-manager suppression turned into a silent None return; the
    explicit None guard makes the empty-list contract hold in that case too.
    """
    import csv

    logger.debug("Parse file %s", file)
    raw_data = []
    with SafeOpen(file, encoding="utf-8") as csv_file:
        # SafeOpen yields None (after logging a warning) for missing/unreadable files.
        if csv_file is None:
            return raw_data
        try:
            for row in csv.reader(csv_file):
                raw_data.append(row)
        except OSError as error:
            logger.error("Read csv file failed : %s", error)
            return []
    return raw_data
def get_package_version(package_name) -> str:
    """
    Get package version info by importlib
    Args:
        package_name: package name

    Returns:
        version: version info string, "UNKNOWN" when the package is absent
    """
    if sys.version_info >= (3, 8):
        # importlib_metadata was merged into the stdlib as importlib.metadata in 3.8
        from importlib import metadata
        from importlib.metadata import PackageNotFoundError
    else:
        import importlib_metadata as metadata
        from importlib_metadata import PackageNotFoundError

    try:
        return metadata.version(package_name)
    except PackageNotFoundError:
        return "UNKNOWN"
class PathManager:
    """Helpers for creating files with fixed, restrictive permissions."""

    # rw for owner, r for group — files created by this tool are not world-readable.
    DATA_FILE_AUTHORITY = 0o640

    @classmethod
    def create_file_safety(cls, path: str):
        """Create an empty file at *path* with mode 0o640.

        A pre-existing regular file is left untouched. Raises RuntimeError when
        *path* is a symlink (refusing to follow it) or when creation fails.
        """
        failure_msg = f"Failed to create file: {os.path.basename(path)}"
        if os.path.islink(path):
            raise RuntimeError(failure_msg)
        if os.path.exists(path):
            return
        try:
            fd = os.open(path, os.O_WRONLY | os.O_CREAT, cls.DATA_FILE_AUTHORITY)
            os.close(fd)
        except Exception as err:
            raise RuntimeError(failure_msg) from err
self.DEFAULT_FIND_RUNNING_PID_TIMES + self.npu_affinity_cpu_dict = {} + self.log_file = '' + self._init_log_file() + + + def _init_log_file(self): + now_time = datetime.now(tz=timezone.utc) + time_stamp = str(now_time.year) + '_' + \ + str(now_time.month) + '_' + \ + str(now_time.day) + '_' + \ + str(now_time.hour) + '_' + \ + str(now_time.minute) + '_' + \ + str(now_time.second) + log_file_name = 'bind_core_' + time_stamp + '.log' + msg = f"Failed to create file: {log_file_name}" + try: + PathManager.create_file_safety(os.path.join(os.getcwd(), log_file_name)) + except RuntimeError as err: + raise RuntimeError(msg) from err + self.log_file = log_file_name + logging.basicConfig(filename=self.log_file, + level=logging.INFO, + filemode='w', + format='%(asctime)s-%(name)s-%(levelname)s-%(message)s') + + def _get_all_npu_id(self) -> None: + get_npu_info_cmd = 'npu-smi info -l' + get_npu_info_process = subprocess.run(get_npu_info_cmd.split(), shell=False, capture_output=True) + get_npu_id_cmd = 'grep ID' + get_npu_id_process = subprocess.run(get_npu_id_cmd.split(), shell=False, input=get_npu_info_process.stdout, capture_output=True) + res = get_npu_id_process.stdout.decode('utf-8').split() + for i in res: + if i.isdigit(): + self.npu_id_list.append(int(i)) + logging.info(f'NPU total id list: {self.npu_id_list}') + + def _get_npu_affinity(self) -> bool: + cpu_num = os.cpu_count() + cpu_num_for_each_npu = cpu_num // len(self.npu_id_list) + get_npu_topo_cmd = 'npu-smi info -t topo' + p = subprocess.run(get_npu_topo_cmd.split(), shell=False, capture_output=True) + res = p.stdout.decode('utf-8').split() + if not res: + print('[ERROR] Failed to run get npu affinity info, please check if driver version support cmd npu-smi info -t topo') + return False + + index = 0 + for v in res: + if '-' in v: + affinity_cpus = [] + cpu_lists = v.split(',') + for cpu_list in cpu_lists: + cpus = cpu_list.split('-') + if len(cpus) != 2: + continue + if int(cpus[1]) - int(cpus[0]) == 
cpu_num_for_each_npu - 1: + cpus[1] = str(int(cpus[1]) + cpu_num_for_each_npu) + affinity_cpus.append(cpus[0] + '-' + cpus[1]) + if index < len(self.npu_id_list): + self.npu_affinity_cpu_dict[self.npu_id_list[index]] = ','.join(affinity_cpu for affinity_cpu in affinity_cpus) + index += 1 + else: + print('[ERROR] Get affinity_cpu_list for {} npus, more than real npu num: {}'.format(index + 1, len(self.npu_id_list))) + return False + + for k in self.npu_affinity_cpu_dict.keys(): + logging.info(f'Affinity CPU list {self.npu_affinity_cpu_dict[k]} for NPU {k}') + return True + + def get_running_pid_on_npu(self) -> bool: + no_running_pids_on_npu_msg = '[INFO] Now there is no running process on all NPUs, stop bind cores' + logging.info('Begin to find running process on all NPUs') + # get running process on NPUs + for times in range(self.find_running_pid_times): + running_pid_on_npu = {} + for npu_id in self.npu_id_list: + get_npu_pids_cmd = 'npu-smi info -t proc-mem -i {} -c 0'.format(npu_id) + get_npu_pids_process = subprocess.run(get_npu_pids_cmd.split(), shell=False, capture_output=True) + res = get_npu_pids_process.stdout.decode('utf-8').split() + pid_list = [] + for value in res: + if value.startswith('id:'): + pid = value.split(':')[1] + pid_list.append(pid) + if pid_list: + running_pid_on_npu[npu_id] = list(set(pid_list)) + + if len(self.running_pid_on_npu.keys()) == len(running_pid_on_npu.keys()) and running_pid_on_npu: + self.running_pid_on_npu = running_pid_on_npu + break + + self.running_pid_on_npu = running_pid_on_npu + time.sleep(5) + + # delete repeat pid + for npu_id in self.npu_id_list: + if npu_id not in self.running_pid_on_npu: + continue + pids_on_npu = self.running_pid_on_npu[npu_id] + for npu_id_with_pids, pids in self.running_pid_on_npu.items(): + if npu_id == npu_id_with_pids: + continue + pids_on_npu = list(set(pids_on_npu) - set(pids)) + self.running_pid_on_npu[npu_id] = pids_on_npu + + if_running_process = False + for npu_id, pids in 
self.running_pid_on_npu.items(): + if not pids: + logging.info(f'There is no running process on NPU {npu_id}') + else: + logging.info(f'Succeed to find running process {pids} on NPU {npu_id}') + if_running_process = True + if not if_running_process: + print(no_running_pids_on_npu_msg) + return if_running_process + + def get_npu_info(self) -> bool: + try: + self._get_all_npu_id() + if not self._get_npu_affinity(): + return False + except subprocess.CalledProcessError: + return False + return True + + def run_bind_core(self): + if not self.running_pid_on_npu: + return + for npu, pid_list in self.running_pid_on_npu.items(): + if npu not in self.npu_affinity_cpu_dict.keys(): + logging.warning(f'Cannot find affinity cpu for npu: {npu}') + continue + affinity_cpu = self.npu_affinity_cpu_dict.get(npu) + for pid in pid_list: + try: + logging.info(f'Begin to bind cores for process {pid} on NPU {npu}') + set_affinity_cpu_cmd = 'taskset -pc {} {}'.format(affinity_cpu, pid) + p = subprocess.run(set_affinity_cpu_cmd.split(), shell=False, capture_output=True) + logging.info(p.stdout.decode('utf-8')) + except subprocess.CalledProcessError: + print('[ERROR] Failed to bind process {} on NPU {} with cpu cores list {}'.format(pid, npu, affinity_cpu)) + + logging.info(f'Succeed to bind process {pid} on NPU {npu} with cpu cores list {affinity_cpu}') + + def args_parse(self): + parser = argparse.ArgumentParser(description='This is a affinity cpu core bind script.') + parser.add_argument('-t', '--time', type=int, metavar='', help='Wait time before bind cores that you want to set. 
The unit is \'s\'.') + parser.add_argument('-app', '--application', metavar='', nargs='+', help='Training or inference command that you want to run.') + args = parser.parse_args() + if args.application: + application_cmd = ' '.join(args.application) + self.launch_process(application_cmd) + time.sleep(2) + # if time is set, wait for setting time before bind cores + if args.time: + time.sleep(args.time) + + def launch_process(self, cmd: list): + logging.info(f'Start to execute cmd: {cmd}') + try: + subprocess.Popen(cmd.split(), shell=False) + except subprocess.CalledProcessError as e: + raise RuntimeError(f'Failed to run cmd: {cmd}') from e + + +if __name__ == '__main__': + print('[INFO] Begin to run bind-cores script...') + bind_core_manager = BindCoreManager() + bind_core_manager.args_parse() + + if not bind_core_manager.get_npu_info(): + print('[ERROR] Failed to get current npus info') + exit() + + if not bind_core_manager.get_running_pid_on_npu(): + exit() + bind_core_manager.run_bind_core() + print('[INFO] End to run bind-cores script, the log is saved in {}'.format(bind_core_manager.log_file)) + + diff --git a/profiler/cli_review/__init__.py b/profiler/cli_review/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..eab13571c58756cc978ebc59479c86c0d1e85529 --- /dev/null +++ b/profiler/cli_review/__init__.py @@ -0,0 +1,4 @@ +from profiler.advisor.config.config import Config +from profiler.advisor.utils.utils import Timer + +Config().set_log_path(f"att_advisor_{Timer().strftime}.xlsx") diff --git a/profiler/cli_review/analyze_cli.py b/profiler/cli_review/analyze_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..7bd7f1722517edc2e8177d3b88af06a6217cf5f2 --- /dev/null +++ b/profiler/cli_review/analyze_cli.py @@ -0,0 +1,136 @@ +import click +import sys +import os +import logging +from pathlib import Path + +sys.path.append(os.path.join(os.path.dirname(os.path.dirname(__file__)), "compare_tools")) 
+sys.path.append(os.path.join(os.path.dirname(os.path.dirname(__file__)), "cluster_analyse")) + +from profiler.advisor.utils.tools import CONTEXT_SETTINGS, ClickAliasedGroup +from profiler.advisor.common import constant +from profiler.advisor.utils.utils import debug_option +from profiler.advisor.interface.interface import Interface +from profiler.cluster_analyse.cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor + +logger = logging.getLogger() + + +def _analyze(dimensions, **kwargs): + result_list = [] + job_list = [] + if not Path(kwargs.get("profiling_path")).exists(): + print(f"[ERROR] Profiling_path is not existed. Invalid profiling path: {kwargs.get('profiling_path')}") + return + + def is_cluster(): + profiling_path = kwargs.get("profiling_path") + path_list = [os.path.join(profiling_path, dir_name) for dir_name in os.listdir(profiling_path)] + ascend_pt_dirs = [path for path in path_list if os.path.isdir(path) and path.endswith("ascend_pt")] + data_processor = PytorchDataPreprocessor(ascend_pt_dirs) + data_map = data_processor.get_data_map() + return len(data_map) > 1 + + is_cluster = is_cluster() + + for dimension in dimensions: + if not is_cluster and dimension == "cluster": + continue + for scope in Interface.get_scope(dimension): + interface = Interface(**kwargs) + job_list.append((dimension, scope, interface)) + + for i, (dimension, scope, interface) in enumerate(job_list[::-1]): + result_list.append( + interface.get_result(dimension, scope, render_html=i == len(job_list) - 1, output_dict=False, **kwargs)) + + for result in result_list[::-1]: + if result and hasattr(result, "show"): + result.show() + break + + +@click.group(name="analyze", cls=ClickAliasedGroup) +def analyze_cli(**kwargs): + """Analyze profiling datasets and give performance optimization suggestion.""" + pass + + +@analyze_cli.command(context_settings=CONTEXT_SETTINGS, + name="all", + short_help='Analyze timeline, operators and graph.') 
+@click.option('--profiling_path', '-d', 'profiling_path', type=click.Path(), required=True, + help='Directory of profiling data') +@click.option('--benchmark_profiling_path', '-bp', 'benchmark_profiling_path', type=click.Path(), + help='Directory of benchmark profiling data, used for compare performance') +@click.option('--cann_version', '-cv', 'cann_version', + type=click.Choice(constant.SUPPORTED_CANN_VERSION, case_sensitive=False), + default=constant.DEFAULT_CANN_VERSION, + help='The CANN software version, which can be viewed by executing the following command: ' + '"cat /usr/local/Ascend/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info"') +@click.option('--torch_version', '-tv', 'torch_version', + type=click.Choice(constant.SUPPORTED_TORCH_VERSION, case_sensitive=False), + default=constant.DEFAULT_TORCH_VERSION, + help='The runtime torch version, which can be detected by exec command "pip show torch"') +# @click.option('--is_inference', is_flag=True, help="Enable performance analysis of inference task") +@click.option("-pt", + "--profiling_type", + metavar="", + default=constant.ASCEND_PYTORCH_PROFILER, + required=False, + type=click.Choice(constant.SUPPORTED_PROFILING_TYPE), + help="enter the profiling type, selectable range ascend_pytorch_profiler, mslite ,msprof") +@debug_option +def analyze_all(**kwargs) -> None: + # 当前compare_tools必须输入两个profiling路径,att-advisor有等价功能支持输入一个Profiling路径,后续替换成对应实现 + if not kwargs.get("benchmark_profiling_path"): + kwargs["benchmark_profiling_path"] = kwargs.get("profiling_path") + try: + _analyze(Interface.all_dimension, **kwargs) + except RuntimeError as e: + print(f"[ERROR] {e}") + + +@analyze_cli.command(context_settings=CONTEXT_SETTINGS, + name="schedule", + short_help='Analyze timeline, operators and graph.') +@click.option('--profiling_path', '-d', 'profiling_path', type=click.Path(), required=True, + help='Directory of profiling data') +@click.option('--cann_version', '-cv', 'cann_version', + 
type=click.Choice(constant.SUPPORTED_CANN_VERSION, case_sensitive=False), + default=constant.DEFAULT_CANN_VERSION, + help='The CANN software version, which can be viewed by executing the following command: ' + '"cat /usr/local/Ascend/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info"') +@click.option('--torch_version', '-tv', 'torch_version', + type=click.Choice(constant.SUPPORTED_TORCH_VERSION, case_sensitive=False), + default=constant.DEFAULT_TORCH_VERSION, + help='The runtime torch version, which can be detected by exec command "pip show torch"') +@debug_option +def analyze_schedule(**kwargs) -> None: + _analyze(["schedule"], **kwargs) + + +@analyze_cli.command(context_settings=CONTEXT_SETTINGS, + name="computation", + short_help='Analyze timeline, operators and graph.') +@click.option('--profiling_path', '-d', 'profiling_path', type=click.Path(), required=True, + help='Directory of profiling data') +@click.option('--cann_version', '-cv', 'cann_version', + type=click.Choice(constant.SUPPORTED_CANN_VERSION, case_sensitive=False), + default=constant.DEFAULT_CANN_VERSION, + help='The CANN software version, which can be viewed by executing the following command: ' + '"cat /usr/local/Ascend/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info"') +@click.option('--torch_version', '-tv', 'torch_version', + type=click.Choice(constant.SUPPORTED_TORCH_VERSION, case_sensitive=False), + default=constant.DEFAULT_TORCH_VERSION, + help='The runtime torch version, which can be detected by exec command "pip show torch"') +@click.option("-pt", + "--profiling_type", + metavar="", + default=constant.ASCEND_PYTORCH_PROFILER, + required=False, + type=click.Choice(constant.SUPPORTED_PROFILING_TYPE), + help="enter the profiling type, selectable range ascend_pytorch_profiler, mslite ,msprof") +@debug_option +def analyze_computation(**kwargs) -> None: + _analyze(["computation"], **kwargs) \ No newline at end of file diff --git a/profiler/cli_review/cluster_cli.py 
b/profiler/cli_review/cluster_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..93a4a638f270a7aeac853943895f707c7a0c0f28 --- /dev/null +++ b/profiler/cli_review/cluster_cli.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import ast +import click +import os +import sys + +sys.path.append(os.path.dirname(os.path.dirname(__file__))) +from profiler.advisor.utils.tools import CONTEXT_SETTINGS, ClickAliasedGroup +from profiler.advisor.utils.utils import debug_option +from profiler.prof_common.constant import Constant +from profiler.cluster_analyse.cluster_analysis import ALL_FEATURE_LIST +from profiler.cluster_analyse.cluster_analysis import cluster_analysis_main + + +context_settings = dict(Constant.CONTEXT_SETTINGS) +context_settings['ignore_unknown_options'] = True + + +@click.command(context_settings=context_settings, name="cluster", + short_help='Analyze cluster data to locate slow nodes and slow links.') +@click.option('--profiling_path', '-d', type=click.Path(), required=True, + help='path of the profiling data') +@click.option('--mode', '-m', type=click.Choice(ALL_FEATURE_LIST), default='all') +@click.argument('args', nargs=-1) +def cluster_cli(profiling_path, mode, args) -> None: + required_args = ('-d', profiling_path, '-m', mode) + cluster_analysis_main(required_args + args) diff --git a/profiler/cli_review/compare_cli.py 
b/profiler/cli_review/compare_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..19faeb4aa811b47b690bd7583205c920aa43bc2d --- /dev/null +++ b/profiler/cli_review/compare_cli.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import ast +import click +import os +import sys + +sys.path.append(os.path.dirname(os.path.dirname(__file__))) + +from profiler.prof_common.constant import Constant +from profiler.prof_common.analyze_dict import AnalyzeDict +from profiler.compare_tools.compare_backend.comparison_generator import ComparisonGenerator + +@click.command(context_settings=Constant.CONTEXT_SETTINGS, name="compare", + short_help='Compare the performance differences between GPUs and NPUs.') +@click.option('--profiling_path', '-d', 'base_profiling_path', type=click.Path(), required=True, + help='path of the profiling data') +@click.option('--benchmark_profiling_path', '-bp', 'comparison_profiling_path', type=click.Path(), required=True) +@click.option('--enable_profiling_compare', is_flag=True) +@click.option('--enable_operator_compare', is_flag=True) +@click.option('--enable_memory_compare', is_flag=True) +@click.option('--enable_communication_compare', is_flag=True) +@click.option('--output_path', '-o', 'output_path', type=click.Path()) +@click.option('--max_kernel_num', 'max_kernel_num', type=int, help="The number of kernels per torch op is limited.") 
+@click.option('--op_name_map', type=ast.literal_eval, default='{}', + help="The mapping of operator names equivalent to GPUs and NPUs in the form of dictionaries.", + required=False) +@click.option('--use_input_shape', is_flag=True) +@click.option('--gpu_flow_cat', type=str, default='', help="Identifier of the GPU connection.") +def compare_cli(**kwargs) -> None: + args = AnalyzeDict(kwargs) + try: + ComparisonGenerator(args).run() + except RuntimeError as e: + print(f"[ERROR] {e}") diff --git a/profiler/cli_review/complete_cli.py b/profiler/cli_review/complete_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4efd0af90daa84b7ae5c3a0b2462dc52873da5 --- /dev/null +++ b/profiler/cli_review/complete_cli.py @@ -0,0 +1,29 @@ +import click + +from profiler.advisor.utils.tools import CONTEXT_SETTINGS + + +@click.command(context_settings=CONTEXT_SETTINGS, + short_help='Auto complete ma-advisor command in terminal, support "bash(default)/zsh/fish".') +@click.argument('shell_type', nargs=1, default="Bash", type=click.Choice(["Bash", "Zsh", "Fish"], case_sensitive=False)) +def auto_complete_cli(shell_type): + """ + Auto complete ma-advisor command in terminal. 
+ + Example: + + \b + # print bash auto complete command to terminal + msprof-analyze auto-completion Bash + """ + click.echo("Tips: please paste following shell command to your terminal to activate auto completion.\n") + if shell_type.lower() == "bash": + bash_str = 'eval "$(_MSPROF_ANALYZE_COMPLETE=bash_source msprof-analyze)"' + elif shell_type.lower() == "zsh": + bash_str = 'eval "$(_MSPROF_ANALYZE_COMPLETE=zsh_source msprof-analyze)"' + elif shell_type.lower() == "fish": + bash_str = 'eval (env _MSPROF_ANALYZE_COMPLETE=fish_source msprof-analyze)' + else: + click.echo(f'Unsupported shell type {shell_type}.') + return + click.echo(f'{bash_str}\n') diff --git a/profiler/cli_review/entrance.py b/profiler/cli_review/entrance.py new file mode 100644 index 0000000000000000000000000000000000000000..a260553031ecfc904ae8411d944037bdb2f101ab --- /dev/null +++ b/profiler/cli_review/entrance.py @@ -0,0 +1,67 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +import click + +from profiler.cli.analyze_cli import analyze_cli +from profiler.cli.complete_cli import auto_complete_cli +from profiler.cli.compare_cli import compare_cli +from profiler.cli.cluster_cli import cluster_cli +from profiler.advisor.version import print_version_callback, cli_version + +logger = logging.getLogger() +CONTEXT_SETTINGS = dict(help_option_names=['-H', '-h', '--help'], + max_content_width=160) + +COMMAND_PRIORITY = { + "advisor": 1, + "compare": 2, + "cluster": 3, + "auto-completion": 4 +} + + +class SpecialHelpOrder(click.Group): + + def __init__(self, *args, **kwargs): + super(SpecialHelpOrder, self).__init__(*args, **kwargs) + + def list_commands_for_help(self, ctx): + """ + reorder the list of commands when listing the help + """ + commands = super(SpecialHelpOrder, self).list_commands(ctx) + return [item[1] for item in sorted((COMMAND_PRIORITY.get(command, float('INF')), + command) for command in commands)] + + def get_help(self, ctx): + self.list_commands = self.list_commands_for_help + return super(SpecialHelpOrder, self).get_help(ctx) + + +@click.group(context_settings=CONTEXT_SETTINGS, cls=SpecialHelpOrder) +@click.option('--version', '-V', '-v', is_flag=True, + callback=print_version_callback, expose_value=False, + is_eager=True, help=cli_version()) +def msprof_analyze_cli(**kwargs): + pass + + +msprof_analyze_cli.add_command(analyze_cli, name="advisor") +msprof_analyze_cli.add_command(compare_cli, name="compare") +msprof_analyze_cli.add_command(cluster_cli, name="cluster") +msprof_analyze_cli.add_command(auto_complete_cli, name="auto-completion") diff --git a/profiler/cluster_analyse_review/README.md b/profiler/cluster_analyse_review/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f0434ab1327d61b679a26767fdce3f28d1cd1f32 --- /dev/null +++ b/profiler/cluster_analyse_review/README.md @@ -0,0 +1,180 @@ +# 集群分析工具 
+cluster_analyse(集群分析工具)是在集群场景下,通过此工具来进行集群数据的分析,当前主要对基于通信域的迭代内耗时分析、通信时间分析以及通信矩阵分析为主, 从而定位慢卡、慢节点以及慢链路问题。
+
+## 性能数据采集
+当前集群调优工具主要支持Ascend PyTorch Profiler采集方式下的集群数据。采集方式参考:[Profiling数据采集](https://gitee.com/ascend/att/tree/master/profiler),此工具只需要通过Ascend PyTorch Profiler工具采集NPU的性能数据即可。
+
+我们要求至少是L1级别的数据。
+```python
+experimental_config = torch_npu.profiler._ExperimentalConfig(
+    profiler_level=torch_npu.profiler.ProfilerLevel.Level1
+)
+```
+### 确认数据是否可用
+
+打开采集到的某张卡数据(*ascend_pt结尾的文件夹),可用的数据应该具备:
+
+- ./profiler_info_x.json,
+- ./ASCEND_PROFILER_OUTPUT/step_trace_time.csv,
+- ./ASCEND_PROFILER_OUTPUT/trace_view.json,
+- ./ASCEND_PROFILER_OUTPUT/kernel_details.csv,
+- ./ASCEND_PROFILER_OUTPUT/communication.json,
+- ./ASCEND_PROFILER_OUTPUT/communication_matrix.json
+
+或者具备:
+
+- analysis.db
+- ascend_pytorch_profiler_{rank_id}.db
+
+以上csv、json文件与db文件只能存在一类,否则集群分析工具解析异常。
+
+确认这几个文件生成后,继续下面的集群分析。
+
+## 数据汇聚与解析
+
+### 操作步骤
+
+1. 参见《[性能工具](../README.md)》完成工具安装。建议安装最新版本。
+
+2. 将所有卡的数据拷贝并汇集到一个目录下,在本目录下运行以下命令即可生成cluster_analysis_output文件夹。
+
+   ```bash
+   msprof-analyze cluster -d {cluster profiling data path} -m {mode}
+   ```
+
+   或
+
+   ```bash
+   python3 cluster_analysis.py -d {cluster profiling data path} -m {mode}
+   ```
+
+   参数说明:
+
+   | 参数名 | 说明 | 是否必选 |
+   | --------------------- | ------------------------------------------------------------ | -------- |
+   | --collection_path或-d | 性能数据汇集目录,运行分析脚本之后会在该目录下自动创建cluster_analysis_output文件夹,保存分析数据。 | 是 |
+   | --mode或-m | 数据解析模式,取值详见“**--mode参数说明**”表。 | 否 |
+   | --parallel_mode | 设置收集多卡、多节点db数据时的并发方式。取值为concurrent(使用concurrent.futures进程池实现并发)。
**只有-m配置cann_api_sum、compute_op_sum、hccl_sum、mstx_sum时可配置此参数。** | 否 | + | --export_type | 设置导出的数据形式。取值为db(.db格式文件)和notebook(Jupyter Notebook文件),默认值为db。
**只有-m配置cann_api_sum、compute_op_sum、hccl_sum、mstx_sum时可配置此参数。** | 否 | + | --rank_list | 对特定Rank上的数据进行统计,默认值为all(表示对所有Rank进行统计),须根据实际卡的Rank ID配置。应配置为大于等于0的整数,若所配置的值大于实际训练所运行的卡的Rank ID,则仅解析合法的RankID的数据,比如当前环境Rank ID为0到7,实际训练运行0到3卡,此时若配置Rank ID为0, 3, 4或不存在的10等其他值,则仅解析0和3。配置示例:--rank_list 0, 1, 2。
**只有-m配置cann_api_sum、compute_op_sum、hccl_sum、mstx_sum时可配置此参数。** | 否 | + | --top_num | 设置TopN耗时的通信算子的数量,默认值为15,配置示例:--top_num 20。
**只有-m配置hccl_sum时可配置此参数。** | 否 | + + --mode参数说明: + + | 参数名 | 说明 | 是否必选 | + | -------------------- | ------------------------------------------------------------ | -------- | + | communication_matrix | 解析通信矩阵数据。 | 否 | + | communication_time | 解析通信耗时数据。 | 否 | + | all | 同时解析通信矩阵communication_matrix和通信耗时数据communication_time,--mode参数默认值为all。 | 否 | + | cann_api_sum | 集群API性能数据汇总分析,输入性能数据需要基于ascend_pytorch_profiler_{rank_id}.db文件。--export_type为db时,输出交付件cluster_analysis.db;--export_type为notebook时,在cluster_analysis_output/CannApiSum目录下输出交付件stats.ipynb。 | 否 | + | compute_op_sum | 集群场景性能数据的device运行算子信息汇总分析,输入性能数据需要基于ascend_pytorch_profiler_{rank_id}.db文件。--export_type为db时,输出交付件cluster_analysis.db;--export_type为notebook时,在cluster_analysis_output/ComputeOpSum目录下输出交付件stats.ipynb。 | 否 | + | hccl_sum | 集合通信算子耗时分析,输入性能数据需要基于ascend_pytorch_profiler_{rank_id}.db文件。--export_type为db时,输出交付件cluster_analysis.db;--export_type为notebook时,在cluster_analysis_output/HcclSum目录下输出交付件stats.ipynb。 | 否 | + | mstx_sum | 集群场景mstx打点信息汇总分析,输入性能数据需要基于ascend_pytorch_profiler_{rank_id}.db文件。--export_type为db时,输出交付件cluster_analysis.db;--export_type为notebook时,在cluster_analysis_output/MstxSum目录下输出交付件stats.ipynb。 | 否 | + + --parallel_mode参数示例如下: + + ```bash + msprof-analyze cluster -d {cluster profiling data path} -m cann_api_sum --parallel_mode concurrent + ``` + + 或 + + ```bash + python3 cluster_analysis.py -d {cluster profiling data path} -m cann_api_sum --parallel_mode concurrent + ``` + + +### 交付件 + +集群分析工具的交付件通过Ascend Insight工具展示,详见《[MindStudio Ascend Insight用户指南](https://www.hiascend.com/document/detail/zh/mindstudio/70RC1/GUI-baseddevelopmenttool/msascendinsightug/AscendInsight_0002.html)》。 + +#### cluster_step_trace_time.csv + +数据解析模式为communication_matrix、communication_time或all时均生成。 + +A列: Step数,是采集性能数据时设置的,一般来说集群性能数据采集一个step足够,如果采集多个step,需要先筛选一下。 + +B列: Type,主要分两种,rank和stage, 和后面的index强相关,可以理解为一个是单卡rank,一个是rank group(pp 并行的stage),如果type为stage,则后面D-K列信息为rank group下的最大值。 + +C列:Index,与type相关,表示卡号。 + 
+D列:Computing, 此列统计计算时间。
+
+E列:Communication(Not Overlapped),此列统计未被掩盖的通信耗时。
+
+F列:Overlapped,统计计算与通信重叠的耗时。
+
+G列:Communication,通信时间的全部耗时。
+
+H列:Free,空闲时间,指device侧既不在通信也不在计算的耗时,可能在做sdma拷贝或者空等。
+
+I列:Stage时间,I、J、K列属于pp并行时有效的数值,stage时间代表除receive算子时间外的时间。
+
+J列:Bubble时间,指receive时间的总和。
+
+K列:Communication(Not Overlapped and Exclude Receive)指剔除receive算子外的并且不被掩盖的通信时间。
+
+L列:Preparing,指迭代开始到首个计算或通信算子运行的时间。
+
+**Tips**:先筛选B列type为stage, 看stage间是否有问题,再筛选B列type为rank,看rank是否有问题,根据以下几点排查。
+
+* 根据Computing的时间差异判断是否有慢卡,或者有负载不均衡的现象。
+
+* 根据Free统计是否有host bound或者分布不均现象。
+
+* 根据Communication(Not Overlapped and Exclude Receive)时间判断是否通信耗时占比过大。
+
+* 根据Bubble时间的占比和理论计算公式判断bubble设置是否合理,是否stage间有不均衡现象。
+
+以上时间理论上都应该处于持平状态,即最大值小于最小值5%,否则就可能出现慢卡。
+
+#### cluster_communication_matrix.json
+
+数据解析模式为communication_matrix或all时生成。
+
+直接打开json(vscode或json查看器), 搜索"Total", 会有多个搜索结果,一般来说链路带宽信息的结构:
+
+```bash
+{src_rank}-{dst_rank}: {
+    "Transport Type": "LOCAL",
+    "Transit Time(ms)": 0.02462,
+    "Transit Size(MB)": 16.777216,
+    "Bandwidth(GB/s)": 681.4466
+}
+```
+**Tips**:可以根据rank互联的带宽以及链路类型,判断是否有慢链路的问题。
+
+- "LOCAL"是片内拷贝,速率非常快,不需要考虑。
+- “HCCS”或“PCIE”是节点内片间拷贝,速度在18GB左右或以上比较正常。
+- “RDMA”是节点间拷贝,910A速度在12GB左右或以上。
+
+#### cluster_communication.json
+
+数据解析模式为communication_time或all时生成。
+
+主要为通信耗时数据。
+
+#### cluster_analysis.db
+
+解析analysis.db或ascend_pytorch_profiler_{rank_id}.db生成的交付件,根据数据解析模式不同而解析不同的数据,可以使用Ascend Insight工具展示。
+
+#### stats.ipynb
+
+- 数据解析模式为cann_api_sum时生成,保存在cluster_analysis_output/CannApiSum目录下。
+
+  可使用jupyter notebook工具或Ascend Insight工具打开,主要展示集群API耗时信息。
+
+- 数据解析模式为compute_op_sum时生成,保存在cluster_analysis_output/ComputeOpSum目录下。
+
+  可使用jupyter notebook工具或Ascend Insight工具打开,主要展示集群计算算子耗时分析(将集群所有计算算子进行汇总并以图表展示),集群Rank计算算子耗时分析(将每个Rank的计算算子进行各自汇总)。
+
+- 数据解析模式为hccl_sum时生成,保存在cluster_analysis_output/HcclSum目录下。
+
+  可使用jupyter notebook工具或Ascend Insight工具打开,主要展示集群通信算子耗时分析(将集群所有通信算子进行汇总并以图表展示),集群Rank通信算子耗时分析(将每个Rank的通信算子进行各自汇总)、Top通信算子信息展示。
+
+- 
数据解析模式为mstx_sum时生成,保存在cluster_analysis_output/MstxSum目录下。 + + 可使用jupyter notebook工具或Ascend Insight工具打开,主要展示集群场景mstx打点信息,分为框架侧、CANN侧和Device侧三部分的打点信息。 + + + diff --git a/profiler/cluster_analyse_review/__init__.py b/profiler/cluster_analyse_review/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c --- /dev/null +++ b/profiler/cluster_analyse_review/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/analysis/__init__.py b/profiler/cluster_analyse_review/analysis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c --- /dev/null +++ b/profiler/cluster_analyse_review/analysis/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/analysis/analysis_facade.py b/profiler/cluster_analyse_review/analysis/analysis_facade.py new file mode 100644 index 0000000000000000000000000000000000000000..435d77b21bff423b207bf050ea660a1738f0fe5f --- /dev/null +++ b/profiler/cluster_analyse_review/analysis/analysis_facade.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from multiprocessing import Process + +from analysis.communication_analysis import CommunicationAnalysis +from analysis.comm_matrix_analysis import CommMatrixAnalysis +from analysis.step_trace_time_analysis import StepTraceTimeAnalysis +from analysis.host_info_analysis import HostInfoAnalysis +from common_func.context import Context +from common_func.constant import Constant + +class AnalysisFacade: + default_module = {CommunicationAnalysis, StepTraceTimeAnalysis, CommMatrixAnalysis, HostInfoAnalysis} + + def __init__(self, params: dict): + self.params = params + + def cluster_analyze(self): + # 多个profiler用多进程处理 + process_list = [] + for analysis in self.default_module: + process = Process(target=analysis(self.params).run) + process.start() + process_list.append(process) + + for process in process_list: + process.join() + + def recipe_analyze(self): + HostInfoAnalysis(self.params).run() + print("[INFO] Recipe analysis launched.") + try: + with Context.create_context(self.params.get(Constant.PARALLEL_MODE)) as context: + with self.params.get(Constant.RECIPE_CLASS)(self.params) as recipe: + recipe.run(context) + except Exception as e: + print("[ERROR] Recipe analysis launched failed, %s." % str(e)) diff --git a/profiler/cluster_analyse_review/analysis/base_analysis.py b/profiler/cluster_analyse_review/analysis/base_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..a902e28ec052aafcd58493aff586108dd61552fd --- /dev/null +++ b/profiler/cluster_analyse_review/analysis/base_analysis.py @@ -0,0 +1,251 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class BaseAnalysis:
    """Common scaffolding for the text/db cluster communication analyses.

    Subclasses are expected to define SAVED_JSON plus the abstract hooks
    ``run``/``dump_db`` and a ``compute_total_info`` used during combination.
    """

    def __init__(self, param: dict):
        self.collection_path = param.get(Constant.COLLECTION_PATH)
        self.data_map = param.get(Constant.DATA_MAP)
        self.data_type = param.get(Constant.DATA_TYPE)
        self.communication_ops = []
        self.collective_group_dict = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP)
        self.comm_ops_struct = {}
        self.adapter = DataTransferAdapter()

    @staticmethod
    def compute_ratio(dividend: float, divisor: float):
        """Return dividend / divisor rounded to 4 digits, or 0 for a ~zero divisor."""
        if abs(divisor) < Constant.EPS:
            return 0
        return round(dividend / divisor, 4)

    @staticmethod
    def check_add_op(op_name: str):
        """Compatibility check across two data versions: decide whether an op's
        statistics should be accumulated into totals.

        Ops tagged middle/top/bottom are per-segment duplicates and are skipped;
        ops tagged 'total' (or carrying no tag at all) are counted.
        """
        stat_list = ["middle", "top", "bottom", "total"]
        total = "total"
        for stat_name in stat_list:
            if stat_name in op_name and stat_name != total:
                return False
        return True

    @abstractmethod
    def run(self):
        pass

    def dump_data(self):
        """Dump the combined op structure as json (text mode) or into the db."""
        if not self.comm_ops_struct:
            print("[WARNING] There is no final comm ops data generated")
            return
        if self.data_type == Constant.TEXT:
            self.dump_json()
        else:
            self.dump_db()

    @abstractmethod
    def dump_db(self):
        pass

    def dump_json(self):
        # Keys may be tuples (rank groups); stringify them so json can store them.
        # NOTE(review): SAVED_JSON is expected to be supplied by subclasses.
        output_comm_data = {str(key): value for key, value in self.comm_ops_struct.items()}
        FileManager.create_json_file(self.collection_path, output_comm_data, self.SAVED_JSON)

    def split_op_by_group(self):
        """Arrange ops as comm_ops_struct[rank_tuple][step][op_name][rank] = info.

        P2P ops all share the P2P pseudo-group; collective ops are keyed by the
        tuple of ranks in their communication group.
        """
        for single_op in self.communication_ops:
            if single_op.get(Constant.COMM_OP_TYPE) == Constant.P2P:
                rank_tup = Constant.P2P
            else:
                rank_tup = tuple(self.collective_group_dict.get(single_op.get(Constant.GROUP_NAME), []))
            rank_id = single_op.get(Constant.RANK_ID, 'N/A')
            step_id = single_op.get(Constant.STEP_ID, 'N/A')
            op_name = single_op.get(Constant.COMM_OP_NAME, 'N/A')
            op_info = single_op.get(Constant.COMM_OP_INFO)
            self.comm_ops_struct.setdefault(rank_tup, {}).setdefault(step_id, {}).\
                setdefault(op_name, {}).setdefault(rank_id, op_info)

    def combine_ops_total_info(self):
        # compute_total_info is provided by the concrete analysis subclass.
        for group_dict in self.comm_ops_struct.values():
            for communication_ops in group_dict.values():
                self.compute_total_info(communication_ops)


class BaseRecipeAnalysis:
    """Base class for recipe-style analyses that map over per-rank profiler dbs."""

    UNIT = "Us"
    DB_UNIT = "Ns"

    RANK_LIST = "rank_list"

    def __init__(self, params):
        self._params = params
        self._collection_dir = params.get(Constant.COLLECTION_PATH, "")
        self._data_map = params.get(Constant.DATA_MAP, {})
        self._recipe_name = params.get(Constant.RECIPE_NAME, "")
        self._mode = params.get(Constant.PARALLEL_MODE, "")
        self._export_type = params.get(Constant.EXPORT_TYPE, "")
        self._output_dir = None
        self._rank_list = params.get(self.RANK_LIST, 'all')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Log, but do not suppress, exceptions raised inside the with-block.
        if self._params is not None and exc_type is not None:
            print(f"[ERROR] Failed to exit analysis: {exc_val}")
            traceback.print_exc(file=sys.stdout)

    def run(self, context):
        # Overridden by concrete recipes; the base implementation is a no-op.
        pass

    @property
    def base_dir(self):
        return os.path.basename(os.path.dirname(__file__))

    def _get_rank_db(self):
        """Return [(rank_id, db_path)] for the requested ranks.

        Unknown rank ids and missing db files are reported with warnings and
        skipped rather than failing the whole analysis.
        """
        invalid_rank_id = []
        if self._rank_list == 'all':
            rank_ids = list(self._data_map.keys())
        else:
            rank_ids = []
            for rank_id in self._rank_list:
                if rank_id in self._data_map:
                    rank_ids.append(rank_id)
                else:
                    invalid_rank_id.append(str(rank_id))
        db_paths = []
        for rank_id in rank_ids:
            rank_path = self._data_map[rank_id]
            db_path = os.path.join(rank_path, Constant.SINGLE_OUTPUT, f"ascend_pytorch_profiler_{rank_id}.db")
            if os.path.exists(db_path):
                db_paths.append((rank_id, db_path))
            else:
                print(f"[WARNING] DB file not found, rank id: {rank_id}, db path: {db_path}.")
        if invalid_rank_id:
            print(f"[WARNING] Invalid Rank id : [{','.join(invalid_rank_id)}].")
        return db_paths

    def get_mode(self):
        return self._mode

    def get_recipe_name(self):
        return self._recipe_name

    def dump_data(self, data, file_name, table_name=None, index=True):
        """Dump a DataFrame into a sqlite table (when table_name is given) or a csv.

        The csv path converts units from DB_UNIT to UNIT first; `index` controls
        whether the frame's index is written in either target.
        """
        output_path = os.path.join(self._collection_dir, Constant.CLUSTER_ANALYSIS_OUTPUT)
        if table_name:
            result_db = os.path.join(output_path, file_name)
            conn, cursor = DBManager.create_connect_db(result_db)
            if isinstance(data, pd.DataFrame):
                # Fix: honor the caller's `index` flag (was hard-coded to True).
                data.to_sql(table_name, conn, if_exists='replace', index=index)
            else:
                print(f"[ERROR] Unknown dump data type: {type(data)}")
            DBManager.destroy_db_connect(conn, cursor)
        else:
            result_csv = os.path.join(output_path, file_name)
            if isinstance(data, pd.DataFrame):
                data = convert_unit(data, self.DB_UNIT, self.UNIT)
                data.to_csv(result_csv, index=index)
            else:
                print(f"[ERROR] Unknown dump data type: {type(data)}")

    def _create_output_dir_name(self, name):
        """Return the first '{name}-{i}' (i >= 1) that does not exist yet."""
        i = 1
        while os.path.exists(f"{name}-{i}"):
            i += 1
        return f"{name}-{i}"

    def _create_unique_output_dir(self):
        output_dir = os.path.join(self._collection_dir, Constant.CLUSTER_ANALYSIS_OUTPUT, self._recipe_name)
        if os.path.exists(output_dir):
            return self._create_output_dir_name(output_dir)
        return output_dir

    def _get_output_dir(self):
        # Lazily create the unique output directory on first use.
        if self._output_dir is None:
            self._output_dir = self._create_unique_output_dir()
            os.makedirs(self._output_dir)
        return self._output_dir

    def create_notebook(self, filename, notebook_template_dir=None, replace_dict=None):
        """Copy a notebook template into the output dir.

        When replace_dict is given, every occurrence of str(key) in the template
        text is replaced by str(value) before writing.
        """
        if notebook_template_dir is None:
            template_path = os.path.dirname(__file__)
        else:
            template_path = notebook_template_dir
        output_path = os.path.join(self._get_output_dir(), filename)
        template_file = os.path.join(template_path, self.base_dir, filename)
        if replace_dict is None:
            shutil.copy(template_file, output_path)
        else:
            with open(template_file, 'r') as f:
                template_content = f.read()
            for key, value in replace_dict.items():
                template_content = template_content.replace(str(key), str(value))
            with open(output_path, 'w') as f:
                f.write(template_content)
        print(f"[INFO] Notebook export path is: {self._get_output_dir()}")

    def add_helper_file(self, helper_file):
        """Copy a helper script that sits next to this module into the output dir."""
        helper_output_path = os.path.join(self._get_output_dir(), helper_file)
        helper_file_path = os.path.join(os.path.dirname(__file__), helper_file)
        # Fix: os.path.join never returns None, so the original `is not None`
        # check was vacuous; guard on actual file existence instead.
        if os.path.exists(helper_file_path):
            shutil.copy(helper_file_path, helper_output_path)

    @staticmethod
    def _filter_data(mapper_data):
        """Drop (rank, data) pairs whose data is None or empty."""
        return [(rank, data) for rank, data in mapper_data if data is not None and len(data) != 0]

    @classmethod
    def add_parser_argument(cls, parser):
        parser.add_argument("--rank_list", type=str, help="Rank id list", default='all')

    @classmethod
    def parse_argument(cls, args_parsed) -> dict:
        """Translate the parsed --rank_list argument into {'rank_list': 'all' | [ints]}.

        Non-numeric entries in a comma-separated list are silently dropped.
        """
        if args_parsed.rank_list == 'all':
            return {cls.RANK_LIST: 'all'}
        rank_str_list = args_parsed.rank_list.split(",")
        rank_list = [int(rank) for rank in rank_str_list if rank.isdigit()]
        return {cls.RANK_LIST: rank_list}

    @classmethod
    def get_extra_argument(cls, params) -> dict:
        return {cls.RANK_LIST: params.get(cls.RANK_LIST, "all")}
/dev/null +++ b/profiler/cluster_analyse_review/analysis/cann_api_sum/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/analysis/cann_api_sum/cann_api_sum.py b/profiler/cluster_analyse_review/analysis/cann_api_sum/cann_api_sum.py new file mode 100644 index 0000000000000000000000000000000000000000..db37b004b150eaa65b9c9cd4e12f1f5bdc0836e9 --- /dev/null +++ b/profiler/cluster_analyse_review/analysis/cann_api_sum/cann_api_sum.py @@ -0,0 +1,108 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import os

import pandas as pd

from analysis.base_analysis import BaseRecipeAnalysis
from common_func.constant import Constant
from common_func.utils import stdev
from cluster_statistics_export.cann_api_sum_export import CannApiSumExport


class CannApiSum(BaseRecipeAnalysis):
    """Recipe that aggregates per-rank CANN API statistics across the cluster."""

    def __init__(self, params):
        super().__init__(params)
        print("[INFO] CannApiSum init.")

    @property
    def base_dir(self):
        # Directory name of this recipe, used to locate its notebook template.
        return os.path.basename(os.path.dirname(__file__))

    @staticmethod
    def _mapper_func(data_map, analysis_class):
        """Load one rank's exported stats; return (rank_id, df) or None when empty."""
        rank_id, db_path = data_map[0], data_map[1]
        df = CannApiSumExport(db_path, analysis_class).read_export_db()
        if df is None or df.empty:
            print(f"[WARNING] There is no stats data in {db_path}.")
            return None
        return rank_id, df

    def mapper_func(self, context):
        """Fan the per-rank export out over the parallel context and gather results."""
        pending = context.map(
            self._mapper_func,
            self._get_rank_db(),
            analysis_class=self._recipe_name
        )
        return context.wait(pending)

    def reducer_func(self, mapper_res):
        """Merge per-rank frames, aggregate them, then dump per the export type."""
        per_rank = self._filter_data(mapper_res)
        if not per_rank:
            print("[ERROR] Mapper data is None.")
            return
        rank_frames = pd.concat(df.assign(rank=rank) for rank, df in per_rank)
        cluster_stats = self._aggregate_stats(rank_frames)
        if self._export_type == "db":
            self.dump_data(rank_frames, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, "CannApiSumRank")
            self.dump_data(cluster_stats, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, "CannApiSum")
        elif self._export_type == "notebook":
            self.dump_data(rank_frames, os.path.join(self._get_output_dir(), "rank_stats.csv"), index=False)
            self.dump_data(cluster_stats, os.path.join(self._get_output_dir(), "all_stats.csv"))
            self.save_notebook()
        else:
            print("[ERROR] Unknown export type.")

    def run(self, context):
        self.reducer_func(self.mapper_func(context))

    @staticmethod
    def _aggregate_stats(stats_res):
        """Collapse the concatenated per-rank stats into one row per API name.

        Cross-rank quartiles are approximations built from the per-rank values:
        Q1 takes the minimum, Q3 the maximum, med the median of the medians.
        minRank/maxRank list the ranks on which the global min/max was observed.
        """
        grouped = stats_res.groupby("name")
        res = {}
        total_time = grouped["totalTimeNs"].sum()
        res["timeRatio"] = total_time / total_time.sum() * 100.0
        res["totalTimeNs"] = total_time
        res["totalCount"] = grouped["totalCount"].sum()
        res["averageNs"] = res["totalTimeNs"] / res["totalCount"]
        res["Q1Ns"] = grouped["Q1Ns"].min()
        res["medNs"] = grouped["medNs"].median()
        res["Q3Ns"] = grouped["Q3Ns"].max()
        res["minNs"] = grouped["minNs"].min()
        res["maxNs"] = grouped["maxNs"].max()
        res["stdev"] = grouped.apply(lambda x: stdev(x, res))
        min_value = grouped["minNs"].min()
        res["minRank"] = grouped.apply(
            lambda x: ", ".join(
                x.loc[x["minNs"] == min_value.loc[x.name], "rank"].astype(str)
            )
        )
        max_value = grouped["maxNs"].max()
        res["maxRank"] = grouped.apply(
            lambda x: ", ".join(
                x.loc[x["maxNs"] == max_value.loc[x.name], "rank"].astype(str)
            )
        )
        res = pd.concat(res.values(), axis=1, keys=res.keys()).round(1)
        res.sort_values(by="totalTimeNs", ascending=False, inplace=True)
        return res

    def save_notebook(self):
        """Export the stats notebook template plus its display helper module."""
        self.create_notebook("stats.ipynb")
        self.add_helper_file("cluster_display.py")
"pyo.init_notebook_mode()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## 集群场景CANN层API统计分析\n", + "该分析脚本展示了集群场景的统计数据分析结果。需要注意以下几点:\n", + "1. 所有的时间信息单位是微秒(us);\n", + "2. Q1表示单个API耗时的25%分位数,最终结果取自所有卡的Q1值中最小值;\n", + "3. Q3表示单个API耗时的75%分位数,最终结果取自所有卡的Q3值中最大值;\n", + "4. 'minRank'展示了API最小耗时所在卡;\n", + "5. 'maxRank'展示了API最大耗时所在卡。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"all_stats.csv\")\n", + "display(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cluster_display.display_box(df, xaxis_title=\"name\", yaxis_title=\"duration (ns)\")\n", + "cluster_display.display_stats_scatter(df, xaxis_title=\"name\", yaxis_title=\"duration (ns)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "per_rank_df = pd.read_csv(\"rank_stats.csv\")\n", + "cluster_display.display_stats_per_operation(per_rank_df, xaxis_title='rank', yaxis_title='duration (ns)')" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/profiler/cluster_analyse_review/analysis/cluster_display.py b/profiler/cluster_analyse_review/analysis/cluster_display.py new file mode 100644 index 0000000000000000000000000000000000000000..8fc6040ccafae2d069e2e6e394941c7aff83a452 --- /dev/null +++ b/profiler/cluster_analyse_review/analysis/cluster_display.py @@ -0,0 +1,239 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def get_stats_cols(df):
    """Resolve the quartile/median/stdev column names present in *df*.

    Returns (q1, q3, med, std): the '(Us)' column names when present,
    otherwise their '~'-suffixed fallbacks.
    """
    available = set(df.columns)

    def pick(preferred, fallback):
        return preferred if preferred in available else fallback

    return (
        pick("Q1(Us)", "Q1~"),
        pick("Q3(Us)", "Q3~"),
        pick("med(Us)", "med~"),
        pick("stdev", "stdev~"),
    )


def display_box(df, x=None, **layout_args):
    """Draw one plotly box trace from the precomputed stats columns of *df*."""
    if x is None:
        x = df.columns[0]
    q1, q3, med, std = get_stats_cols(df)
    fig = go.Figure()
    # NOTE(review): minRank/maxRank appear to hold rank labels rather than
    # durations; confirm they are the intended whisker (fence) values.
    fig.add_trace(
        go.Box(
            x=df[x],
            q1=df[q1],
            median=df[med],
            q3=df[q3],
            sd=df[std],
            lowerfence=df["minRank"],
            upperfence=df["maxRank"]
        )
    )
    fig.update_layout(**layout_args)
    fig.show()


def display_stats_scatter(df, x=None, **layout_args):
    """Scatter the quartile/median and min/max-rank columns against *x*."""
    if x is None:
        x = df.columns[0]
    q1, q3, med, _ = get_stats_cols(df)
    fig = go.Figure()
    for column in (q1, med, q3, "minRank", "maxRank"):
        fig.add_trace(
            go.Scatter(
                x=df[x],
                y=df[column],
                name=column
            )
        )
    fig.update_layout(**layout_args)
    fig.show()


def display_table_per_rank(df):
    """Show *df* one rank at a time, selected through a dropdown widget."""
    if df.empty:
        display(df)
        return

    rank_groups = df.groupby("rank")

    def show_rank(name):
        rank_df = rank_groups.get_group(name).drop(columns=["rank"])
        display(rank_df)

    selector = Dropdown(
        options=rank_groups.groups.keys(),
        description="rank:",
        disabled=False,
    )
    interact(
        show_rank,
        name=selector
    )


def display_stats_per_operation(df, x=None, box=True, scatter=True, table=True, **layout_args):
    """Per-operation drill-down: optional table, box plot and scatter per selection."""
    if df.empty:
        display(df)
        return

    if x is None:
        x = df.columns[0]

    op_groups = df.groupby(x)

    def show_operation(name):
        op_df = op_groups.get_group(name)
        if table:
            display(op_df.reset_index(drop=True).set_index("rank"))
        if box:
            display_box(op_df, x=op_df["rank"], **layout_args)
        if scatter:
            display_stats_scatter(op_df, x=op_df["rank"], **layout_args)

    operations = list(op_groups.groups.keys())

    if len(operations) > 1:
        # Initialize the dropdown on the second entry and flip it back to the
        # first: the value change forces an initial render of the widgets.
        selector = Dropdown(
            options=operations,
            description="Operation:",
            disabled=False,
            value=operations[1]
        )
        interact(
            show_operation,
            name=selector
        )
        selector.value = operations[0]
    else:
        show_operation(operations[0])


def display_duration_boxplots(figs, stats_df: pd.DataFrame, orientation="v", title=None,
                              x_title="Names", y_title="Time", legend_title="Legend"):
    """Box plots for the duration columns of *stats_df*; absent columns plot as None."""
    return display_boxplot(
        figs,
        stats_df.index,
        stats_df.get("Min(Us)", None),
        stats_df.get("Q1(Us)", None),
        stats_df.get('Median(Us)', None),
        stats_df.get('Q3(Us)', None),
        stats_df.get("Max(Us)", None),
        stats_df.get("Mean(Us)", None),
        orientation=orientation, title=title, x_title=x_title, y_title=y_title,
        legend_title=legend_title
    )


def display_boxplot(figs, x_axis, min_ds, q1_ds, median_ds, q3_ds, max_ds, mean_ds, orientation="v",
                    title=None, x_title=None, y_title="Time", legend_title="Legend"):
    """Render a single box trace from explicit five-number-summary series."""
    fig = go.Figure()
    fig.add_trace(
        go.Box(
            x=x_axis,
            lowerfence=min_ds,
            q1=q1_ds,
            median=median_ds,
            q3=q3_ds,
            upperfence=max_ds,
            mean=mean_ds
        )
    )
    fig.update_traces(orientation=orientation)
    fig.update_layout(
        xaxis_title=x_title, yaxis_title=y_title, legend_title=legend_title,
        title=title, height=1024
    )
    fig.show()
    if isinstance(figs, list):
        figs.append(fig)
    return fig


def display_graph(figs, x_axis, y_axes, title=None,
                  x_title=None, y_title=None, legend_title="Legend"):
    """Line plot of one or several series; accepts DataFrame/dict/Series/ndarray."""
    if isinstance(y_axes, pd.DataFrame):
        frame = y_axes.set_index(x_axis)
    elif isinstance(y_axes, dict):
        frame = pd.DataFrame(y_axes, index=x_axis)
    elif isinstance(y_axes, pd.Series):
        frame = pd.DataFrame({"": y_axes}, index=x_axis)
    elif isinstance(y_axes, np.ndarray):
        frame = pd.DataFrame({"": pd.Series(y_axes)}, index=x_axis)
    else:
        # Unsupported payload type: draw nothing, as before.
        return None

    fig = frame.plot.line()
    fig.update_layout(
        title=title, xaxis_title=x_title, yaxis_title=y_title, legend_title=legend_title
    )
    fig.show()
    if isinstance(figs, list):
        figs.append(fig)
    return fig


def display_stats_per_rank_groups_combobox(rank_stats_gdf):
    """Dropdown over rank groups; shows a table plus box/line plots per group."""
    names = list(rank_stats_gdf.groups.keys())
    if len(names) > 1:
        # Same initial-render trick as display_stats_per_operation.
        selector = Dropdown(
            options=names, layout={"width": "max-content"}, value=names[1]
        )
        interact(
            __display_stats_per_rank_group,
            selected=selector,
            rank_stats_gdf=fixed(rank_stats_gdf)
        )
        selector.value = names[0]
    elif len(names) == 1:
        __display_stats_per_rank_group(names[0], rank_stats_gdf)
    else:
        print("cluster_display func:input rank_stats_gdf groups is null so no need to display")


def __display_stats_per_rank_group(selected, rank_stats_gdf):
    """Render one rank group: its table, duration box plots and quartile lines."""
    df = rank_stats_gdf.get_group(selected).reset_index(drop=True)
    df = df.set_index(df["Rank"])
    display(df)

    figs = []
    display_duration_boxplots(figs, df, x_title="Ranks")
    display_graph(
        figs,
        df.index,
        df[["Q1(Us)", "Median(Us)", "Q3(Us)"]],
        title="50% of Distribution",
        x_title="Ranks"
    )


def display_stats_optional_combobox(options, display_func, args, description="Option:"):
    """Generic dropdown wrapper: invoke display_func(selected, args) on selection."""
    if len(options) > 1:
        selector = Dropdown(
            options=options, layout={"width": "max-content"}, value=options[1],
            description=description
        )
        interact(
            display_func,
            selected=selector,
            args=fixed(args)
        )
        selector.value = options[0]
    elif len(options) == 1:
        display_func(options[0], args)
# ---- profiler/cluster_analyse_review/analysis/comm_matrix_analysis.py ----
import os
from collections import defaultdict

from analysis.base_analysis import BaseAnalysis
from common_func.constant import Constant
from common_func.db_manager import DBManager


class CommMatrixAnalysis(BaseAnalysis):
    """Build the cluster communication-matrix result (json or db)."""

    SAVED_JSON = "cluster_communication_matrix.json"
    COMMUNICATION_MATRIX_TABLE = "ClusterCommAnalyzerMatrix"

    def __init__(self, param: dict):
        super().__init__(param)
        self.communication_ops = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.MATRIX_OPS)

    @staticmethod
    def combine_link(link_info_dict: dict, single_link_dict: dict):
        """Accumulate one link's transit time/size into the merged link entry."""
        link_info_dict[Constant.TRANSPORT_TYPE] = single_link_dict.get(Constant.TRANSPORT_TYPE)
        link_info_dict[Constant.OP_NAME] = single_link_dict.get(Constant.OP_NAME, '')
        link_info_dict[Constant.TRANSIT_TIME_MS] += single_link_dict.get(Constant.TRANSIT_TIME_MS, 0)
        link_info_dict[Constant.TRANSIT_SIZE_MB] += single_link_dict.get(Constant.TRANSIT_SIZE_MB, 0)

    def run(self):
        if not self.communication_ops:
            return
        self.split_op_by_group()
        self.combine_ops_total_info()
        self.dump_data()

    def dump_db(self):
        """Write the matrix rows into the cluster sqlite database."""
        res_comm_matrix = self.adapter.transfer_matrix_from_json_to_db(self.comm_ops_struct)
        output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT)
        result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER)
        DBManager.create_tables(result_db, self.COMMUNICATION_MATRIX_TABLE)
        conn, cursor = DBManager.create_connect_db(result_db)
        if res_comm_matrix:
            res_matrix_value = [list(data.values()) for data in res_comm_matrix]
            # Parameterized insert: one '?' placeholder per column.
            sql = "insert into {} values ({value})".format(self.COMMUNICATION_MATRIX_TABLE,
                                                           value="?," * (len(res_matrix_value[0]) - 1) + "?")
            DBManager.executemany_sql(conn, sql, res_matrix_value)
        DBManager.destroy_db_connect(conn, cursor)

    def compute_total_info(self, step_dict: dict):
        self.merge_same_links(step_dict)
        self.combine_link_info(step_dict)

    def merge_same_links(self, step_dict: dict):
        """Merge duplicate links per op and project local ranks onto global ranks."""

        def process_link_key():
            for link_key in rank_dict:
                if '-' not in link_key:
                    print(f"[WARNING] {op_name} has an invalid link key {link_key}!")
                    # Fix: `continue` instead of `break` — a single malformed key
                    # must not discard every remaining link of this rank.
                    continue
                src_rank = link_key.split('-')[0]
                dst_rank = link_key.split('-')[1]
                if src_rank == dst_rank:
                    # A self-link reveals the mapping local rank -> global rank.
                    if src_rank not in project_local_global_rank_map:
                        project_local_global_rank_map[src_rank] = rank_id
                    elif project_local_global_rank_map.get(src_rank) != rank_id:
                        print(f"[WARNING] In the same communication group, local ranks projecting to global ranks "
                              f"repeat!")
                self.combine_link(link_info[link_key], rank_dict[link_key])

        def convert_local_to_global_rank():
            tmp_link = {}
            for link_key, link_dict in link_info.items():
                src_rank = link_key.split('-')[0]
                dst_rank = link_key.split('-')[1]
                src_rank = project_local_global_rank_map.get(src_rank, src_rank)
                dst_rank = project_local_global_rank_map.get(dst_rank, dst_rank)
                link_dict[Constant.BANDWIDTH_GB_S] = \
                    self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0),
                                       link_dict.get(Constant.TRANSIT_TIME_MS, 0))
                tmp_link[f"{src_rank}-{dst_rank}"] = link_dict
            return tmp_link

        project_local_global_rank_map = dict()
        for op_name, op_dict in step_dict.items():
            link_info = defaultdict(lambda: {
                Constant.TRANSPORT_TYPE: '',
                Constant.TRANSIT_TIME_MS: 0,
                Constant.TRANSIT_SIZE_MB: 0,
                Constant.OP_NAME: ''
            })
            for rank_id, rank_dict in op_dict.items():
                process_link_key()
            step_dict[op_name] = convert_local_to_global_rank()

    def combine_link_info(self, step_dict: dict):
        """Append a TOTAL_OP_INFO entry accumulating every op check_add_op accepts."""
        total_op_info = defaultdict(lambda: {
            Constant.TRANSPORT_TYPE: '',
            Constant.TRANSIT_TIME_MS: 0,
            Constant.TRANSIT_SIZE_MB: 0,
            Constant.OP_NAME: ''
        })
        for op_name, op_dict in step_dict.items():
            if self.check_add_op(op_name):
                for link_key, link_dict in op_dict.items():
                    self.combine_link(total_op_info[link_key], link_dict)
        for link_key, link_dict in total_op_info.items():
            link_dict[Constant.BANDWIDTH_GB_S] = \
                self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0),
                                   link_dict.get(Constant.TRANSIT_TIME_MS, 0))
        step_dict[Constant.TOTAL_OP_INFO] = total_op_info


# ---- profiler/cluster_analyse_review/analysis/communication_analysis.py ----
class CommunicationAnalysis(BaseAnalysis):
    """Build the cluster communication time/bandwidth result (json or db)."""

    SAVED_JSON = "cluster_communication.json"
    COMMUNICATION_BANDWIDTH_TABLE = "ClusterCommAnalyzerBandwidth"
    COMMUNICATION_TIME_TABLE = "ClusterCommAnalyzerTime"

    def __init__(self, param: dict):
        super().__init__(param)
        self.communication_ops = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COMMUNICATION_OPS)

    @staticmethod
    def combine_size_distribution(op_dict: dict, total_dict: dict):
        # Each value is a two-element list keyed by message size; sum both slots.
        for size, size_info in op_dict.items():
            total_dict[size][0] += size_info[0]
            total_dict[size][1] += size_info[1]

    def run(self):
        if not self.communication_ops:
            return
        self.split_op_by_group()
        self.combine_ops_total_info()
        self.dump_data()

    def dump_db(self):
        """Write the time and bandwidth rows into the cluster sqlite database."""
        res_comm_time, res_comm_bandwidth = self.adapter.transfer_comm_from_json_to_db(self.comm_ops_struct)
        output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT)
        result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER)
        DBManager.create_tables(result_db, self.COMMUNICATION_TIME_TABLE, self.COMMUNICATION_BANDWIDTH_TABLE)
        conn, cursor = DBManager.create_connect_db(result_db)
        self.execute(conn, res_comm_time, self.COMMUNICATION_TIME_TABLE)
        self.execute(conn, res_comm_bandwidth, self.COMMUNICATION_BANDWIDTH_TABLE)
        DBManager.destroy_db_connect(conn, cursor)

    @staticmethod
    def execute(conn, res_data, table_name):
        """Bulk-insert a list of row dicts into table_name with a parameterized sql."""
        if res_data:
            res_value = [list(data.values()) for data in res_data]
            sql = "insert into {} values ({value})".format(table_name, value="?," * (len(res_value[0]) - 1) + "?")
            DBManager.executemany_sql(conn, sql, res_value)

    def compute_total_info(self, comm_ops: dict):
        """Append a TOTAL_OP_INFO entry with per-rank summed time and bandwidth."""
        if not comm_ops:
            return
        total_rank_dict = defaultdict(lambda: {
            Constant.COMMUNICATION_TIME_INFO: defaultdict(float),
            Constant.COMMUNICATION_BANDWIDTH_INFO: {}
        })
        for communication_op, rank_dict in comm_ops.items():
            for rank_id, communication_op_info in rank_dict.items():
                for com_info, com_info_dict in communication_op_info.items():
                    if com_info == Constant.COMMUNICATION_TIME_INFO:
                        self.combine_time_info(com_info_dict, total_rank_dict[rank_id][com_info])
                    if com_info == Constant.COMMUNICATION_BANDWIDTH_INFO:
                        self.combine_bandwidth_info(com_info_dict, total_rank_dict[rank_id][com_info])
        for rank_id in total_rank_dict:
            self.compute_time_ratio(total_rank_dict[rank_id][Constant.COMMUNICATION_TIME_INFO])
            self.compute_bandwidth_ratio(total_rank_dict[rank_id][Constant.COMMUNICATION_BANDWIDTH_INFO])
        comm_ops[Constant.TOTAL_OP_INFO] = total_rank_dict

    def combine_time_info(self, com_info_dict: dict, total_time_info_dict: dict):
        # Ratios and the start timestamp are recomputed later, not summed here.
        ratio_list = [Constant.WAIT_TIME_RATIO, Constant.SYNCHRONIZATION_TIME_RATIO]
        for time_info in com_info_dict:
            if time_info not in ratio_list and time_info != Constant.START_TIMESTAMP:
                total_time_info_dict[time_info] += com_info_dict.get(time_info)

    def combine_bandwidth_info(self, com_info_dict: dict, total_bandwidth_info_dict: dict):
        """Sum transit time/size and size distributions per transport type."""
        add_list = [Constant.TRANSIT_TIME_MS, Constant.TRANSIT_SIZE_MB]
        dict_list = [Constant.SIZE_DISTRIBUTION]
        for transport_type, part_transport_dict in com_info_dict.items():
            if transport_type not in total_bandwidth_info_dict:
                total_bandwidth_info_dict[transport_type] = {
                    Constant.TRANSIT_TIME_MS: 0,
                    Constant.TRANSIT_SIZE_MB: 0,
                    Constant.SIZE_DISTRIBUTION: defaultdict(lambda: [0, 0])
                }
            for bandwidth_msg, value in part_transport_dict.items():
                if bandwidth_msg in add_list:
                    total_bandwidth_info_dict[transport_type][bandwidth_msg] += value
                if bandwidth_msg in dict_list:
                    self.combine_size_distribution(value, total_bandwidth_info_dict[transport_type][bandwidth_msg])

    def compute_time_ratio(self, total_time_info_dict: dict):
        """Recompute wait/synchronization ratios from the summed components."""
        total_time_info_dict[Constant.WAIT_TIME_RATIO] = \
            self.compute_ratio(total_time_info_dict.get(Constant.WAIT_TIME_MS, 0),
                               total_time_info_dict.get(Constant.WAIT_TIME_MS, 0) +
                               total_time_info_dict.get(Constant.TRANSIT_TIME_MS, 0))
        total_time_info_dict[Constant.SYNCHRONIZATION_TIME_RATIO] = \
            self.compute_ratio(total_time_info_dict.get(Constant.SYNCHRONIZATION_TIME_MS, 0),
                               total_time_info_dict.get(Constant.SYNCHRONIZATION_TIME_MS, 0) +
                               total_time_info_dict.get(Constant.TRANSIT_TIME_MS, 0))

    def compute_bandwidth_ratio(self, total_bandwidth_info_dict: dict):
        """Derive GB/s bandwidth per transport type from the size and time sums."""
        for transport_type, bandwidth_dict in total_bandwidth_info_dict.items():
            bandwidth_dict[Constant.BANDWIDTH_GB_S] = \
                self.compute_ratio(bandwidth_dict.get(Constant.TRANSIT_SIZE_MB, 0),
                                   bandwidth_dict.get(Constant.TRANSIT_TIME_MS, 0))
index 0000000000000000000000000000000000000000..7101187a2c2619f3b1c20dded14b433950b4c662 --- /dev/null +++ b/profiler/cluster_analyse_review/analysis/compute_op_sum/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/analysis/compute_op_sum/compute_op_sum.py b/profiler/cluster_analyse_review/analysis/compute_op_sum/compute_op_sum.py new file mode 100644 index 0000000000000000000000000000000000000000..e71cf868ac9e06785d030a702bf9c8182ae4e948 --- /dev/null +++ b/profiler/cluster_analyse_review/analysis/compute_op_sum/compute_op_sum.py @@ -0,0 +1,103 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class ComputeOpSum(BaseRecipeAnalysis):
    """Cluster-wide statistics for compute operators.

    Produces three result sets: whole-cluster stats by op type, per-rank stats
    by op type, and per-rank stats by op name/shape.
    """

    TABLE_ALL_RANK_STATS = "ComputeOpAllRankStats"
    TABLE_PER_RANK_STATS_BY_OPTYPE = "ComputeOpPerRankStatsByOpType"
    TABLE_PER_RANK_STATS_BY_OPNAME = "ComputeOpPerRankStatsByOpName"

    def __init__(self, params):
        super().__init__(params)
        print("[INFO] ComputeOpSum init.")
        # All three are populated by reducer_func.
        self.all_rank_stats = None
        self.per_rank_stats_by_optype = None
        self.per_rank_stats_by_opname = None

    @property
    def base_dir(self):
        # Name of the directory this recipe lives in.
        return os.path.basename(os.path.dirname(__file__))

    @staticmethod
    def _mapper_func(data_map, analysis_class):
        # data_map is a (rank_id, db_path) pair.
        df = ComputeOpSumExport(data_map[1], analysis_class).read_export_db()
        if df is None or df.empty:
            print(f"[WARNING] There is no stats data in {data_map[1]}.")
            return None
        df["Rank"] = data_map[0]
        return df

    def mapper_func(self, context):
        """Fan the per-rank export out over the context's worker pool and wait."""
        return context.wait(
            context.map(
                self._mapper_func,
                self._get_rank_db(),
                analysis_class=self._recipe_name
            )
        )

    def reducer_func(self, mapper_res):
        """Aggregate per-rank frames into cluster-wide and per-rank statistics."""
        valid_frames = [df for df in mapper_res if df is not None]
        if not valid_frames:
            print("[ERROR] Mapper data is None.")
            return

        # Per-rank stats grouped by operator type and task type.
        self.per_rank_stats_by_optype = pd.concat(
            describe_duration(df.groupby(["OpType", "TaskType"])["Duration"]).assign(Rank=df["Rank"][0])
            for df in valid_frames
        )
        self.per_rank_stats_by_optype.sort_values(by=["SumNs"], inplace=True, ascending=False)

        # Whole-cluster stats grouped by operator type and task type.
        combined = pd.concat(valid_frames)
        self.all_rank_stats = describe_duration(combined.groupby(["OpType", "TaskType"])["Duration"])
        self.all_rank_stats.sort_values(by=["SumNs"], inplace=True, ascending=False)

        # Per-rank stats grouped by operator name, type, task type and input shape.
        self.per_rank_stats_by_opname = pd.concat(
            describe_duration(
                df.groupby(["OpName", "OpType", "TaskType", "InputShapes"])["Duration"]
            ).assign(Rank=df["Rank"][0])
            for df in valid_frames
        )
        self.per_rank_stats_by_opname.sort_values(by=["SumNs"], inplace=True, ascending=False)

    def run(self, context):
        super().run(context)
        self.reducer_func(self.mapper_func(context))

        if self._export_type == "db":
            self.save_db()
        elif self._export_type == "notebook":
            self.save_notebook()
        else:
            print("[ERROR] Unknown export type.")

    def save_notebook(self):
        out_dir = self._get_output_dir()
        self.dump_data(self.all_rank_stats, os.path.join(out_dir, "all_stats.csv"))
        self.dump_data(self.per_rank_stats_by_optype, os.path.join(out_dir, "rank_stats_by_optype.csv"))
        self.dump_data(self.per_rank_stats_by_opname, os.path.join(out_dir, "rank_stats_by_opname.csv"))
        self.create_notebook("stats.ipynb")
        self.add_helper_file("cluster_display.py")

    def save_db(self):
        target_db = Constant.DB_CLUSTER_COMMUNICATION_ANALYZER
        self.dump_data(self.all_rank_stats, target_db, self.TABLE_ALL_RANK_STATS)
        self.dump_data(self.per_rank_stats_by_optype, target_db, self.TABLE_PER_RANK_STATS_BY_OPTYPE)
        self.dump_data(self.per_rank_stats_by_opname, target_db, self.TABLE_PER_RANK_STATS_BY_OPNAME)
按算子类型和任务类型分组的,每个Rank上计算类算子的耗时情况\n", + "3. 按算子名称、任务类型、输入shape分组的,每个Rank上的计算类算子的耗时情况" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 数据准备" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display, HTML\n", + "display(HTML(\"\"))\n", + "\n", + "import plotly.offline as pyo\n", + "\n", + "def is_lab_notebook():\n", + " import re\n", + " import psutil\n", + " return any(re.search('jupyter--lab-script', x) for x in psutil.Process().parent().cmdline())\n", + "\n", + "if is_lab_notebook():\n", + " pyo.init_notebook_mode()\n", + "\n", + "import pandas as pd\n", + "pd.options.plotting.backend = \"plotly\"\n", + "pd.set_option(\"display.max_rows\", 100)\n", + "pd.set_option(\"display.width\", 1000)\n", + "\n", + "import cluster_display\n", + "\n", + "all_stats_df = pd.read_csv(\"all_stats.csv\", index_col=\"OpType\")\n", + "rank_stats_by_optype_df = pd.read_csv(\"rank_stats_by_optype.csv\", index_col=\"OpType\")\n", + "rank_stats_by_opname_df = pd.read_csv(\"rank_stats_by_opname.csv\", index_col=\"OpName\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 计算类算子耗时分析\n", + "\n", + "将整个集群所有Rank的计算类算子进行汇总,按算子类型和任务类型分类,统计分析耗时情况,时间单位为微秒(us)\n", + "\n", + "包含以下统计项:\n", + "- Count:算子数量\n", + "- Mean:平均耗时\n", + "- Std:标准差\n", + "- Min:最小值\n", + "- Q1:四分之一分位数\n", + "- Median:中位数\n", + "- Q3:四分之三分位数\n", + "- Max:最大值\n", + "- Sum:总耗时" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(all_stats_df)\n", + "fig_all_rank = cluster_display.display_duration_boxplots(None, all_stats_df, x_title=\"OpType\")\n", + "fig_per_rank = cluster_display.display_graph(None, all_stats_df.index, all_stats_df[[\"Q1(Us)\", \"Median(Us)\", \"Q3(Us)\"]], title=\"50% of Distribution\", x_title=\"OpType\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 
单个Rank的计算类算子基于算子类型的耗时分析\n", + "将集群内每个Rank的计算类算子进行汇总,按算子类型和任务类型分类,统计分析耗时情况,时间单位为微秒(us)\n", + "\n", + "包含以下统计项:\n", + "- Count:算子数量\n", + "- Mean:平均耗时\n", + "- Std:标准差\n", + "- Min:最小值\n", + "- Q1:四分之一分位数\n", + "- Median:中位数\n", + "- Q3:四分之三分位数\n", + "- Max:最大值\n", + "- Sum:总耗时" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rank_stats_gdf = rank_stats_by_optype_df.groupby(rank_stats_by_optype_df.index)\n", + "cluster_display.display_stats_per_rank_groups_combobox(rank_stats_gdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 单个Rank的计算类算子基于算子名的耗时分析\n", + "\n", + "将集群内每个Rank的计算类算子进行汇总,按算子名称、任务类型、输入shape分类,统计分析耗时情况,时间单位为微秒(us)\n", + "\n", + "包含以下统计项:\n", + "- Count:算子数量\n", + "- Mean:平均耗时\n", + "- Std:标准差\n", + "- Min:最小值\n", + "- Q1:四分之一分位数\n", + "- Median:中位数\n", + "- Q3:四分之三分位数\n", + "- Max:最大值\n", + "- Sum:总耗时" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rank_stats_gdf = rank_stats_by_opname_df.groupby(rank_stats_by_opname_df.index)\n", + "cluster_display.display_stats_per_rank_groups_combobox(rank_stats_gdf)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/profiler/cluster_analyse_review/analysis/hccl_sum/__init__.py b/profiler/cluster_analyse_review/analysis/hccl_sum/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7101187a2c2619f3b1c20dded14b433950b4c662 --- /dev/null +++ b/profiler/cluster_analyse_review/analysis/hccl_sum/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/analysis/hccl_sum/hccl_sum.py b/profiler/cluster_analyse_review/analysis/hccl_sum/hccl_sum.py new file mode 100644 index 0000000000000000000000000000000000000000..da0c575e4683f1c51c4cf38e89b9c096c484777e --- /dev/null +++ b/profiler/cluster_analyse_review/analysis/hccl_sum/hccl_sum.py @@ -0,0 +1,133 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class HcclSum(BaseRecipeAnalysis):
    """Cluster-wide statistics for HCCL communication operators.

    Produces three result sets: per-OpType stats for the whole cluster,
    per-OpType stats for each rank, and the top-N ops by mean duration.
    """

    TABLE_ALL_RANK_STATS = "HcclAllRankStats"
    TABLE_PER_RANK_STATS = "HcclPerRankStats"
    TABLE_TOP_OP_STATS = "HcclTopOpStats"

    TOP_NUM = "top_num"
    DEFAULT_TOP_NUM = 15

    def __init__(self, params):
        super().__init__(params)
        print("[INFO] HcclSum init.")
        # Populated by reducer_func.
        self.per_rank_stats = None
        self.all_rank_stats = None
        self.top_op_stats = None
        # May come from argparse (int) or from a raw params dict (any type);
        # validated in run().
        self.top_num = params.get(self.TOP_NUM, self.DEFAULT_TOP_NUM)

    @property
    def base_dir(self):
        # Name of the directory this recipe lives in.
        return os.path.basename(os.path.dirname(__file__))

    @staticmethod
    def _mapper_func(data_map, analysis_class):
        # data_map is a (rank_id, db_path) pair.
        df = HcclSumExport(data_map[1], analysis_class).read_export_db()

        if df is None or df.empty:
            print(f"[WARNING] There is no stats data in {data_map[1]}.")
            return None

        df["Rank"] = data_map[0]
        return df

    def mapper_func(self, context):
        """Fan the per-rank export out over the context's worker pool and wait."""
        return context.wait(
            context.map(
                self._mapper_func,
                self._get_rank_db(),
                analysis_class=self._recipe_name
            )
        )

    def reducer_func(self, mapper_res):
        """Aggregate per-rank frames into rank/cluster stats and the top-N op table."""
        mapper_res = list(filter(lambda df: df is not None, mapper_res))
        if not mapper_res:
            print("[ERROR] Mapper data is None.")
            return
        self.per_rank_stats = pd.concat(
            describe_duration(df.groupby("OpType")["Duration"]).assign(Rank=df["Rank"][0]) for df in mapper_res)
        self.per_rank_stats.sort_values(by=["Rank"], inplace=True)
        all_op_data = pd.concat(mapper_res)
        self.all_rank_stats = describe_duration(all_op_data.groupby("OpType")["Duration"])
        grouped_op_stats = all_op_data.groupby("OpName")
        self.top_op_stats = describe_duration(grouped_op_stats["Duration"]).nlargest(self.top_num, "MeanNs")
        # Record which rank held the fastest / slowest instance of each top op.
        min_rank = []
        max_rank = []
        for op_name in self.top_op_stats.index:
            df = grouped_op_stats.get_group(op_name)
            min_rank.append(df[df["Duration"] == df["Duration"].min()]["Rank"].values[0])
            max_rank.append(df[df["Duration"] == df["Duration"].max()]["Rank"].values[0])
        self.top_op_stats["MinRank"] = min_rank
        self.top_op_stats["MaxRank"] = max_rank

    def run(self, context):
        super().run(context)
        # Robustness fix: params may supply a non-int top_num, which would crash
        # nlargest() later; also fixed "a invalid" -> "an invalid" in the message.
        if not isinstance(self.top_num, int) or self.top_num <= 0:
            print(f"[WARNING] HcclSum: top_num is set to an invalid value, "
                  f"it will be reset to default value({self.DEFAULT_TOP_NUM}).")
            self.top_num = self.DEFAULT_TOP_NUM
        mapper_res = self.mapper_func(context)
        self.reducer_func(mapper_res)

        if self._export_type == "db":
            self.save_db()
        elif self._export_type == "notebook":
            self.save_notebook()
        else:
            print("[ERROR] Unknown export type.")

    def save_notebook(self):
        self.dump_data(self.all_rank_stats, os.path.join(self._get_output_dir(), "all_stats.csv"))
        self.dump_data(self.per_rank_stats, os.path.join(self._get_output_dir(), "rank_stats.csv"))
        self.dump_data(self.top_op_stats, os.path.join(self._get_output_dir(), "top_op_stats.csv"))
        self.create_notebook("stats.ipynb")
        self.add_helper_file("cluster_display.py")

    def save_db(self):
        self.dump_data(self.all_rank_stats, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, self.TABLE_ALL_RANK_STATS)
        self.dump_data(self.per_rank_stats, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, self.TABLE_PER_RANK_STATS)
        self.dump_data(self.top_op_stats, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, self.TABLE_TOP_OP_STATS)

    @classmethod
    def add_parser_argument(cls, parser):
        """Register the --top_num CLI option on top of the base arguments."""
        BaseRecipeAnalysis.add_parser_argument(parser)
        parser.add_argument("--top_num", type=int, help="Duration cost top count", default=cls.DEFAULT_TOP_NUM)

    @classmethod
    def parse_argument(cls, args_parsed) -> dict:
        """Convert parsed CLI args into the recipe's params dict."""
        argument_dict = BaseRecipeAnalysis.parse_argument(args_parsed)
        argument_dict.update({
            cls.TOP_NUM: args_parsed.top_num
        })
        return argument_dict

    @classmethod
    def get_extra_argument(cls, params) -> dict:
        """Extract recipe-specific arguments from a raw params dict."""
        argument_dict = BaseRecipeAnalysis.get_extra_argument(params)
        argument_dict.update({
            cls.TOP_NUM: params.get(cls.TOP_NUM, cls.DEFAULT_TOP_NUM)
        })
        return argument_dict
Count:算子数量\n", + "- Mean:平均耗时\n", + "- Std:标准差\n", + "- Min:最小值\n", + "- Q1:四分之一分位数\n", + "- Median:中位数\n", + "- Q3:四分之三分位数\n", + "- Max:最大值\n", + "- Sum:总耗时" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(all_stats_df)\n", + "fig_all_rank = cluster_display.display_duration_boxplots(None, all_stats_df, x_title=\"Hccl OpType\")\n", + "fig_per_rank = cluster_display.display_graph(None, all_stats_df.index, all_stats_df[[\"Q1(Us)\", \"Median(Us)\", \"Q3(Us)\"]], title=\"50% of Distribution\", x_title=\"Hccl OpType\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 集群Rank通信算子耗时分析\n", + "\n", + "将集群内每个Rank的通信算子进行汇总,按算子类型分类,统计分析耗时情况,时间单位为微秒(us)\n", + "\n", + "包含以下统计项:\n", + "- Count:算子数量\n", + "- Mean:平均耗时\n", + "- Std:标准差\n", + "- Min:最小值\n", + "- Q1:四分之一分位数\n", + "- Median:中位数\n", + "- Q3:四分之三分位数\n", + "- Max:最大值\n", + "- Sum:总耗时" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rank_stats_gdf = rank_stats_df.groupby(rank_stats_df.index)\n", + "cluster_display.display_stats_per_rank_groups_combobox(rank_stats_gdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 集群TOP-N通信算子耗时分析\n", + "\n", + "统计集群内耗时最多的TOP-N通信算子,时间单位为微秒(us)\n", + "\n", + "包含以下统计项:\n", + "- Count:算子数量\n", + "- Mean:平均耗时\n", + "- Std:标准差\n", + "- Min:最小值\n", + "- Q1:四分之一分位数\n", + "- Median:中位数\n", + "- Q3:四分之三分位数\n", + "- Max:最大值\n", + "- Sum:总耗时\n", + "- MinRank:耗时最少算子所在的Rank\n", + "- MaxRank:耗时最长算子所在的Rank" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(top_op_stats_df)\n", + "fig_top_op = cluster_display.display_duration_boxplots(None, top_op_stats_df, x_title=\"Hccl OpName\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"name": "python", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/profiler/cluster_analyse_review/analysis/host_info_analysis.py b/profiler/cluster_analyse_review/analysis/host_info_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..563711080ed3a20923ce73ec595b84892492e9f6 --- /dev/null +++ b/profiler/cluster_analyse_review/analysis/host_info_analysis.py @@ -0,0 +1,96 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class HostInfoAnalysis(BaseAnalysis):
    """Merge every rank's HOST_INFO and RANK_DEVICE_MAP tables into the cluster
    analysis output DB. Only runs when the collected data is in DB format."""

    TABLE_HOST_INFO = "HOST_INFO"
    TABLE_RANK_DEVICE_MAP = "RANK_DEVICE_MAP"

    def __init__(self, param: dict):
        super().__init__(param)
        self.all_rank_host_info = {}    # host_uid -> host_name
        self.all_rank_device_info = []  # rank-device rows, each extended with its host_uid

    def run(self):
        if self.data_type != Constant.DB:
            return
        self.analyze_host_info()
        self.dump_db()

    def dump_db(self):
        """Open the cluster result DB and write both merged tables into it."""
        output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT)
        result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER)
        conn, curs = DBManager.create_connect_db(result_db)
        if not (conn and curs):
            print(f"[ERROR] Failed to create db {Constant.DB_CLUSTER_COMMUNICATION_ANALYZER}")
            return
        self.dump_host_info(result_db, conn)
        self.dump_rank_device_map(result_db, conn)
        DBManager.destroy_db_connect(conn, curs)

    def dump_host_info(self, result_db, db_conn):
        if not self.all_rank_host_info:
            # Fixed grammar ("data be analyzed") and dropped the pointless f-prefix.
            print("[WARNING] No host info data to be analyzed.")
            return
        DBManager.create_tables(result_db, Constant.TABLE_HOST_INFO)
        save_host_info = list(self.all_rank_host_info.items())
        # Table name is a trusted project constant; row values go through "?" placeholders.
        sql = "insert into {} values ({value})".format(Constant.TABLE_HOST_INFO,
                                                       value="?," * (len(save_host_info[0]) - 1) + "?")
        DBManager.executemany_sql(db_conn, sql, save_host_info)

    def dump_rank_device_map(self, result_db, db_conn):
        if not self.all_rank_device_info:
            # Fixed grammar and dropped the pointless f-prefix.
            print("[WARNING] No rank device map data to be analyzed.")
            return
        self.all_rank_device_info.sort()
        DBManager.create_tables(result_db, Constant.TABLE_RANK_DEVICE_MAP)
        sql = "insert into {} values ({value})".format(Constant.TABLE_RANK_DEVICE_MAP,
                                                       value="?," * (len(self.all_rank_device_info[0]) - 1) + "?")
        DBManager.executemany_sql(db_conn, sql, self.all_rank_device_info)

    def analyze_host_info(self):
        """Read HOST_INFO / RANK_DEVICE_MAP from every rank DB and accumulate them."""
        print_empty_host_info = ""
        for rank_id, profiling_dir in self.data_map.items():
            host_info = []
            rank_device_info = []
            db_path = os.path.join(profiling_dir, Constant.SINGLE_OUTPUT,
                                   f"ascend_pytorch_profiler_{rank_id}.db")
            if os.path.exists(db_path) and DBManager.check_tables_in_db(db_path, self.TABLE_HOST_INFO):
                conn, curs = DBManager.create_connect_db(db_path)
                sql = "select * from {0}".format(self.TABLE_HOST_INFO)
                host_info = DBManager.fetch_all_data(curs, sql, is_dict=False)
                DBManager.destroy_db_connect(conn, curs)
            if not (host_info and host_info[0]):
                # Remember the warning but print it only once, after the loop.
                if not print_empty_host_info:
                    print_empty_host_info = f"[WARNING] No {self.TABLE_HOST_INFO} data in {self.data_type} file."
                continue
            if os.path.exists(db_path) and DBManager.check_tables_in_db(db_path, self.TABLE_RANK_DEVICE_MAP):
                conn, curs = DBManager.create_connect_db(db_path)
                sql = "select * from {0}".format(self.TABLE_RANK_DEVICE_MAP)
                rank_device_info = DBManager.fetch_all_data(curs, sql, is_dict=False)
                DBManager.destroy_db_connect(conn, curs)
            host_uid, host_name = host_info[0][0], host_info[0][1]
            # Tag every rank-device row with the host it came from.
            for idx, data in enumerate(rank_device_info):
                rank_device_info[idx] = list(data) + [host_uid, ]
            self.all_rank_host_info[host_uid] = host_name
            self.all_rank_device_info.extend(rank_device_info)
        if print_empty_host_info:
            print(print_empty_host_info)
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/analysis/mstx_sum/mstx_sum.py b/profiler/cluster_analyse_review/analysis/mstx_sum/mstx_sum.py new file mode 100644 index 0000000000000000000000000000000000000000..46a0e18abeee5cdd6b058d71e3a1bd2b97e7c29d --- /dev/null +++ b/profiler/cluster_analyse_review/analysis/mstx_sum/mstx_sum.py @@ -0,0 +1,204 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Record for one matched start/stop mark pair; durations are stop - start deltas.
MarkInfo = namedtuple("MarkInfo", ["name", "framework_duration", "cann_duration", "device_duration",
                                   "tid", "start_ns"])


def format_mark_info(df: pd.DataFrame, start_idx, stop_idx, name) -> MarkInfo:
    """Build a MarkInfo from the rows at positional indices start_idx / stop_idx."""
    start_row = df.iloc[start_idx]
    stop_row = df.iloc[stop_idx]
    return MarkInfo(
        name=name,
        framework_duration=float(stop_row["framework_ts"] - start_row["framework_ts"]),
        cann_duration=float(stop_row["cann_ts"] - start_row["cann_ts"]),
        device_duration=float(stop_row["device_ts"] - start_row["device_ts"]),
        tid=start_row["tid"],
        start_ns=start_row["cann_ts"]
    )


def rename_mark_msg_name(mark_stats_df: pd.DataFrame):
    """Disambiguate repeated mark names within a step by appending an index suffix.

    Mutates mark_stats_df in place; names unique within their step are untouched.
    """
    occurrences = {}
    for row_idx, info in enumerate(mark_stats_df.itertuples(index=False)):
        occurrences.setdefault(info.step_id, {}).setdefault(info.name, []).append(row_idx)
    for per_step in occurrences.values():
        for base_name, row_ids in per_step.items():
            if len(row_ids) <= 1:
                continue
            for order, row_id in enumerate(row_ids):
                mark_stats_df.loc[row_id, 'name'] = f"{base_name}_{order}"


def compute_step_id(mark_stat, step_stats_df: pd.DataFrame):
    """Return the step_id whose [start_ns, end_ns] window contains the mark; 0 if none."""
    containing = next(
        (step for step in step_stats_df.itertuples(index=False)
         if step.start_ns <= mark_stat.start_ns <= step.end_ns),
        None
    )
    if containing is not None:
        return containing.step_id
    print(f"[WARNING] {mark_stat.name} is not in any step.")
    return 0


def format_columns(df: pd.DataFrame):
    """Rename duration/step columns to their exported names and drop raw *_ns and Tid."""
    renamed = df.rename(
        columns={
            "framework_duration": "FrameworkDurationNs",
            "cann_duration": "CannDurationNs",
            "device_duration": "DeviceDurationNs",
            "duration": "DurationNs",
            "step_id": "StepId",
            "tid": "Tid",
            "name": "Name"
        }
    )
    keep = [col for col in renamed.columns if not col.endswith("_ns") and col != "Tid"]
    return renamed[keep]
class MstxSum(BaseRecipeAnalysis):
    """Cluster-wide statistics for MSTX mark/step data.

    Matches *_start/*_stop mark pairs per rank, assigns each pair to a step,
    and aggregates framework/CANN/device durations per step and mark name.
    """

    TABLE_FRAMEWORK_STATS = "MSTXAllFrameworkStats"
    TABLE_CANN_STATS = "MSTXAllCannStats"
    TABLE_DEVICE_STATS = "MSTXAllDeviceStats"
    TABLE_MARK_STATS = "MSTXMarkStats"

    START_SUFFIX = "_start"
    STOP_SUFFIX = "_stop"

    def __init__(self, params):
        super().__init__(params)
        print("[INFO] MstxSum init.")
        # Populated by reducer_func.
        self.mark_stats = None
        self.all_fwk_stats = None
        self.all_cann_stats = None
        self.all_device_stats = None

    @property
    def base_dir(self):
        # Name of the directory this recipe lives in.
        return os.path.basename(os.path.dirname(__file__))

    @staticmethod
    def _mapper_func(data_map, analysis_class):
        # data_map is a (rank_id, db_path) pair.
        step_df = MstxStepExport(data_map[1], analysis_class).read_export_db()
        if step_df is None or step_df.empty:
            # No step data: treat everything as one all-encompassing step 0.
            step_df = pd.DataFrame({"start_ns": [0], "end_ns": [float("inf")], "step_id": [0]})
        mark_df = MstxMarkExport(data_map[1], analysis_class).read_export_db()
        if mark_df is None or mark_df.empty:
            print(f"[WARNING] There is no mark data in {data_map[1]}.")
            return None
        mark_df["framework_ts"] = mark_df["framework_ts"].astype("int64")

        open_marks = {}   # tid -> base msg -> stack of start row indices (LIFO pairing)
        matched = []
        unmatched = []
        for row_idx, row in enumerate(mark_df.itertuples(index=False)):
            if row.msg.endswith(MstxSum.START_SUFFIX):
                base_msg = row.msg[:-len(MstxSum.START_SUFFIX)]
                open_marks.setdefault(row.tid, {}).setdefault(base_msg, []).append(row_idx)
            elif row.msg.endswith(MstxSum.STOP_SUFFIX):
                base_msg = row.msg[:-len(MstxSum.STOP_SUFFIX)]
                stack = open_marks.get(row.tid, {}).get(base_msg, [])
                if stack:
                    matched.append(format_mark_info(mark_df, stack.pop(), row_idx, base_msg))
                else:
                    unmatched.append((row.msg, row_idx))

        # Collect the start marks that never received a matching stop.
        for per_tid in open_marks.values():
            for base_msg, stack in per_tid.items():
                unmatched.extend((base_msg + MstxSum.START_SUFFIX, row_idx) for row_idx in stack)
        if unmatched:
            unmatched.sort(key=lambda item: item[1])
            print(f"[WARNING] The following mark messages do not match anyone in "
                  f"rank {data_map[0]}: {','.join(item[0] for item in unmatched)}.")

        mark_stats_df = pd.DataFrame(matched).assign(Rank=data_map[0])
        mark_stats_df["step_id"] = mark_stats_df.apply(compute_step_id, axis=1, step_stats_df=step_df)
        rename_mark_msg_name(mark_stats_df)
        return format_columns(mark_stats_df).set_index("Name", drop=True)

    def mapper_func(self, context):
        """Fan the per-rank export out over the context's worker pool and wait."""
        return context.wait(
            context.map(
                self._mapper_func,
                self._get_rank_db(),
                analysis_class=self._recipe_name
            )
        )

    def reducer_func(self, mapper_res):
        """Aggregate per-rank mark stats into per-step framework/CANN/device tables."""
        valid_frames = [df for df in mapper_res if df is not None]
        if not valid_frames:
            print("[ERROR] Mapper data is None.")
            return
        self.mark_stats = pd.concat(valid_frames)

        def _step_stats(grouped, column, step_id):
            # One describe table for a single duration column within one step.
            stats = describe_duration(grouped[column]).assign(StepId=step_id)
            stats.sort_values(by=["SumNs"], inplace=True, ascending=False)
            return stats

        fwk_parts, cann_parts, device_parts = [], [], []
        for step_id, step_df in self.mark_stats.groupby("StepId"):
            by_name = step_df.groupby("Name")
            fwk_parts.append(_step_stats(by_name, "FrameworkDurationNs", step_id))
            cann_parts.append(_step_stats(by_name, "CannDurationNs", step_id))
            device_parts.append(_step_stats(by_name, "DeviceDurationNs", step_id))
        self.all_fwk_stats = pd.concat(fwk_parts)
        self.all_cann_stats = pd.concat(cann_parts)
        self.all_device_stats = pd.concat(device_parts)

    def run(self, context):
        super().run(context)
        self.reducer_func(self.mapper_func(context))

        if self._export_type == "db":
            self.save_db()
        elif self._export_type == "notebook":
            self.save_notebook()
        else:
            print("[ERROR] Unknown export type.")

    def save_notebook(self):
        out_dir = self._get_output_dir()
        self.dump_data(self.mark_stats, os.path.join(out_dir, "mark_stats.csv"))
        self.dump_data(self.all_fwk_stats, os.path.join(out_dir, "all_fwk_stats.csv"))
        self.dump_data(self.all_cann_stats, os.path.join(out_dir, "all_cann_stats.csv"))
        self.dump_data(self.all_device_stats, os.path.join(out_dir, "all_device_stats.csv"))
        self.create_notebook("stats.ipynb")
        self.add_helper_file("cluster_display.py")

    def save_db(self):
        target_db = Constant.DB_CLUSTER_COMMUNICATION_ANALYZER
        self.dump_data(self.mark_stats, target_db, self.TABLE_MARK_STATS)
        self.dump_data(self.all_fwk_stats, target_db, self.TABLE_FRAMEWORK_STATS)
        self.dump_data(self.all_cann_stats, target_db, self.TABLE_CANN_STATS)
        self.dump_data(self.all_device_stats, target_db, self.TABLE_DEVICE_STATS)
按Name分组的,每个Rank上MSTX打点数据的统计情况" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 数据准备" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display, HTML\n", + "display(HTML(\"\"))\n", + "\n", + "import plotly.offline as pyo\n", + "\n", + "def is_lab_notebook():\n", + " import re\n", + " import psutil\n", + " return any(re.search('jupyter--lab-script', x) for x in psutil.Process().parent().cmdline())\n", + "\n", + "if is_lab_notebook():\n", + " pyo.init_notebook_mode()\n", + "\n", + "import pandas as pd\n", + "pd.options.plotting.backend = \"plotly\"\n", + "pd.set_option(\"display.max_rows\", 100)\n", + "pd.set_option(\"display.width\", 1000)\n", + "\n", + "import cluster_display\n", + "\n", + "all_fwk_stats_gdf = pd.read_csv(\"all_fwk_stats.csv\", index_col=\"Name\").groupby(\"StepId\")\n", + "all_cann_stats_gdf = pd.read_csv(\"all_cann_stats.csv\", index_col=\"Name\").groupby(\"StepId\")\n", + "all_device_stats_gdf = pd.read_csv(\"all_device_stats.csv\", index_col=\"Name\").groupby(\"StepId\")\n", + "mark_stats_df = pd.read_csv(\"mark_stats.csv\", index_col=\"Name\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 集群MSTX数据分析\n", + "\n", + "将整个集群所有Rank的MSTX数据进行汇总,按Step划分,统计分析耗时情况,时间单位为微秒(us)\n", + "打点数据分为三种:\n", + "1. 框架侧耗时:Framework Time\n", + "2. Cann侧耗时:Cann Time\n", + "3. 
Device侧耗时:Device Time\n", + "\n", + "3种数据都包含以下统计项:\n", + "- Count:数量\n", + "- Mean:平均耗时\n", + "- Std:标准差\n", + "- Min:最小值\n", + "- Q1:四分之一分位数\n", + "- Median:中位数\n", + "- Q3:四分之三分位数\n", + "- Max:最大值\n", + "- Sum:总耗时" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def display_stats_mstx_step_combobox(selected, args):\n", + " step = selected\n", + " fwk_stats_gdf, cann_stats_gdf, device_stats_gdf = args\n", + " fwk_df = fwk_stats_gdf.get_group(step)\n", + " cann_df = cann_stats_gdf.get_group(step)\n", + " device_df = device_stats_gdf.get_group(step)\n", + " figs = []\n", + " display(HTML(\"

Framework Time Stats

\"))\n", + " display(fwk_df)\n", + " cluster_display.display_duration_boxplots(figs, fwk_df, title=\"Framework Time\", x_title=\"Name\", y_title=\"Time\")\n", + " display(HTML(\"

Cann Time Stats

\"))\n", + " display(cann_df)\n", + " cluster_display.display_duration_boxplots(figs, cann_df, title=\"Cann Time\", x_title=\"Name\", y_title=\"Time\")\n", + " display(HTML(\"

Device Time Stats

\"))\n", + " display(device_df)\n", + " cluster_display.display_duration_boxplots(figs, device_df, title=\"Device Time\", x_title=\"Name\", y_title=\"Time\")\n", + "\n", + "steps = list(all_fwk_stats_gdf.groups.keys())\n", + "if steps:\n", + " cluster_display.display_stats_optional_combobox(steps, display_stats_mstx_step_combobox, \n", + " [all_fwk_stats_gdf, all_cann_stats_gdf, all_device_stats_gdf], \"Step:\")\n", + "else:\n", + " print(\"There is no step in stats, so no need to display\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 集群Rank MSTX数据分析\n", + "\n", + "将集群内每个Rank的MSTX数据进行汇总,按打点Name分类,统计分析耗时情况,时间单位为微秒(us)\n", + "\n", + "包含以下统计项:\n", + "- Name:打点名称\n", + "- FrameworkDuration(Us):框架侧耗时\n", + "- CannDuration(Us):Cann侧耗时\n", + "- DeviceDuration(Us):Device侧耗时\n", + "- Rank:Rank序号\n", + "- StepId:Step序号" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def display_mstx_duration_by_rank(selected, args):\n", + " mark_stats_gdf = args\n", + " df = mark_stats_gdf.get_group(selected).sort_values(\"Rank\")\n", + " display(df)\n", + " fwk_duration = []\n", + " cann_duration = []\n", + " device_duration = []\n", + " step_ids = []\n", + " for step_id, step_df in df.groupby(\"StepId\"):\n", + " fwk_duration.append((step_id, step_df[\"FrameworkDuration(Us)\"].values))\n", + " cann_duration.append((step_id, step_df[\"CannDuration(Us)\"].values))\n", + " device_duration.append((step_id, step_df[\"DeviceDuration(Us)\"].values))\n", + " step_ids.append(step_id)\n", + " fwk_df = pd.concat([pd.Series(dur, name=step_id) for step_id, dur in fwk_duration], axis=1)\n", + " cann_df = pd.concat([pd.Series(dur, name=step_id) for step_id, dur in cann_duration], axis=1)\n", + " device_df = pd.concat([pd.Series(dur, name=step_id) for step_id, dur in device_duration], axis=1)\n", + " figs = []\n", + " ranks = df[\"Rank\"].drop_duplicates()\n", + " cluster_display.display_graph(figs, 
ranks, fwk_df[step_ids],\n", + " title=\"Framework Time\", x_title=\"Rank\", y_title=\"Time\", legend_title=\"Step\")\n", + " cluster_display.display_graph(figs, ranks, cann_df[step_ids],\n", + " title=\"Cann Time\", x_title=\"Rank\", y_title=\"Time\", legend_title=\"Step\")\n", + " cluster_display.display_graph(figs, ranks, device_df[step_ids],\n", + " title=\"Device Time\", x_title=\"Rank\", y_title=\"Time\", legend_title=\"Step\")\n", + "\n", + "mark_stats_gdf = mark_stats_df.groupby(mark_stats_df.index)\n", + "names = list(mark_stats_gdf.groups.keys())\n", + "if steps:\n", + " cluster_display.display_stats_optional_combobox(names, display_mstx_duration_by_rank, mark_stats_gdf, \"Name:\")\n", + "else:\n", + " print(\"There is no mark name in stats, so no need to display\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/profiler/cluster_analyse_review/analysis/step_trace_time_analysis.py b/profiler/cluster_analyse_review/analysis/step_trace_time_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..6a886fffa97b142e8267066117f561154d85b162 --- /dev/null +++ b/profiler/cluster_analyse_review/analysis/step_trace_time_analysis.py @@ -0,0 +1,126 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from common_func.db_manager import DBManager +from common_func.constant import Constant +from common_func.file_manager import FileManager +from prof_bean.step_trace_time_bean import StepTraceTimeBean + + +class StepTraceTimeAnalysis: + CLUSTER_TRACE_TIME_CSV = "cluster_step_trace_time.csv" + CLUSTER_TRACE_TIME_TABLE = "ClusterStepTraceTime" + + def __init__(self, param: dict): + self.collection_path = param.get(Constant.COLLECTION_PATH) + self.data_map = param.get(Constant.DATA_MAP) + self.communication_group = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COMMUNICATION_GROUP) + self.step_time_dict = {} + self.step_data_list = [] + self.data_type = param.get(Constant.DATA_TYPE) + + @staticmethod + def get_max_data_row(data_group_list: list): + if not data_group_list: + return [] + ret = [] + for idx in range(len(data_group_list[0])): + max_val = 0 + for idy in range(len(data_group_list)): + max_val = max(max_val, data_group_list[idy][idx]) + ret.append(max_val) + return ret + + def run(self): + self.load_step_trace_time_data() + self.analyze_step_time() + self.dump_data() + + def dump_data(self): + if not self.step_data_list: + print("[WARNING] Can't get step time info!") + return + if self.data_type == Constant.TEXT: + headers = self.get_headers() + FileManager.create_csv_file(self.collection_path, self.step_data_list, self.CLUSTER_TRACE_TIME_CSV, headers) + else: + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + DBManager.create_tables(result_db, self.CLUSTER_TRACE_TIME_TABLE) + column_len = DBManager.get_table_column_count(result_db, self.CLUSTER_TRACE_TIME_TABLE) + data_len = len(self.step_data_list[0]) + if data_len < column_len: + for data in self.step_data_list: + data.extend([0] * (column_len - data_len)) + conn, 
cursor = DBManager.create_connect_db(result_db) + sql = "insert into {} values ({value})".format(self.CLUSTER_TRACE_TIME_TABLE, + value="?," * (len(self.step_data_list[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, self.step_data_list) + DBManager.destroy_db_connect(conn, cursor) + + def load_step_trace_time_data(self): + for rank_id, profiling_dir_path in self.data_map.items(): + if self.data_type == Constant.TEXT: + step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.STEP_TIME_CSV) + if os.path.exists(step_time_file): + self.step_time_dict[rank_id] = FileManager.read_csv_file(step_time_file, StepTraceTimeBean) + else: + step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, + Constant.DB_COMMUNICATION_ANALYZER) + if (os.path.exists(step_time_file) and + DBManager.check_tables_in_db(step_time_file, Constant.TABLE_STEP_TRACE)): + conn, cursor = DBManager.create_connect_db(step_time_file) + sql = "select * from {0}".format(Constant.TABLE_STEP_TRACE) + data = DBManager.fetch_all_data(cursor, sql, is_dict=False) + self.step_time_dict[rank_id] = data + DBManager.destroy_db_connect(conn, cursor) + if not self.step_time_dict.get(rank_id): + print(f"[WARNING] Rank {rank_id} does not have a valid step_trace_time data in {self.data_type} file.") + + def analyze_step_time(self): + for rank_id, data_bean_list in self.step_time_dict.items(): + for data_bean in data_bean_list: + if self.data_type == Constant.TEXT: + self.step_data_list.append([data_bean.step, Constant.RANK, rank_id] + data_bean.row) + else: + self.step_data_list.append([data_bean[0], Constant.RANK, rank_id] + list(data_bean[1:])) + stage_list = self.communication_group.get(Constant.P2P) + if not stage_list: + return + step_group_dict = {} + for data_list in self.step_data_list: + stage_group = tuple() + for stage in stage_list: + if data_list[2] in stage: + stage_group = tuple(stage) + break + key = (data_list[0], stage_group) + 
step_group_dict.setdefault(key, []).append(data_list[3:]) + + for key, data_group_list in step_group_dict.items(): + if self.data_type == Constant.TEXT: + self.step_data_list.append([key[0], Constant.STAGE, key[1]] + self.get_max_data_row(data_group_list)) + else: + index = "(" + ",".join(str(i) for i in key[1]) + ")" + self.step_data_list.append([key[0], Constant.STAGE, index] + self.get_max_data_row(data_group_list)) + + def get_headers(self): + if self.step_time_dict: + for rank in self.step_time_dict: + if self.step_time_dict.get(rank): + return self.step_time_dict[rank][0].all_headers + return [] diff --git a/profiler/cluster_analyse_review/cluster_analysis.py b/profiler/cluster_analyse_review/cluster_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..a8d01dcfe348be6b47c0a71099cedab64b6b3e06 --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_analysis.py @@ -0,0 +1,148 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os + +from cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor +from cluster_data_preprocess.mindspore_data_preprocessor import MindsporeDataPreprocessor +from communication_group.communication_group_generator import CommunicationGroupGenerator +from common_func.constant import Constant +from common_func.file_manager import FileManager +from common_func.path_manager import PathManager +from common_func import analysis_loader +from analysis.analysis_facade import AnalysisFacade + +COMM_FEATURE_LIST = ['all', 'communication_time', 'communication_matrix'] +ALL_FEATURE_LIST = ['all', 'communication_time', 'communication_matrix', 'cann_api_sum', 'hccl_sum', 'compute_op_sum', + 'mstx_sum'] + + +def get_analysis_args(analysis_class, analysis_args): + parser = argparse.ArgumentParser(description="custom analysis args") + parser.add_argument("--parallel_mode", type=str, help="context mode", default="concurrent") + parser.add_argument("--export_type", type=str, help="export type", default="db") + analysis_class[1].add_parser_argument(parser) + return parser.parse_args(analysis_args) + +def parse_specific_params(analysis_name, analysis_args): + analysis_class = analysis_loader.get_class_from_name(analysis_name) + if not analysis_class: + print("[ERROR] undefined analysis.") + return None + + args_parsed = get_analysis_args(analysis_class, analysis_args) + specific_params = { + Constant.RECIPE_NAME: analysis_class[0], + Constant.RECIPE_CLASS: analysis_class[1], + Constant.PARALLEL_MODE: args_parsed.parallel_mode, + Constant.EXPORT_TYPE: args_parsed.export_type + } + specific_params.update(analysis_class[1].parse_argument(args_parsed)) + return specific_params + +class Interface: + ASCEND_PT = "ascend_pt" + ASCEND_MS = "ascend_ms" + + + def __init__(self, params: dict): + self.collection_path = PathManager.get_realpath(params.get(Constant.COLLECTION_PATH)) + self.analysis_mode = params.get(Constant.ANALYSIS_MODE) + 
self.data_map = {} + self.communication_group = {} + self.collective_group_dict = {} + self.communication_ops = [] + self.matrix_ops = [] + self.origin_params = params + + def allocate_prof_data(self): + ascend_pt_dirs = [] + ascend_ms_dirs = [] + for root, dirs, files in os.walk(self.collection_path): + for dir_name in dirs: + if dir_name.endswith(self.ASCEND_PT): + ascend_pt_dirs.append(os.path.join(root, dir_name)) + if dir_name.endswith(self.ASCEND_MS): + ascend_ms_dirs.append(os.path.join(root, dir_name)) + pytorch_processor = PytorchDataPreprocessor(ascend_pt_dirs) + pt_data_map = pytorch_processor.get_data_map() + data_type = pytorch_processor.get_data_type() + ms_data_map = MindsporeDataPreprocessor(ascend_ms_dirs).get_data_map() + if pt_data_map and ms_data_map: + print("[ERROR] Can not analyze pytorch and mindspore meantime.") + return [] + return (pt_data_map, data_type) if pt_data_map else (ms_data_map, Constant.TEXT) + + def run(self): + PathManager.check_input_directory_path(self.collection_path) + PathManager.check_path_owner_consistent(self.collection_path) + data_map, data_type = self.allocate_prof_data() + if not data_map: + print("[WARNING] Can not get rank info or profiling data.") + return + if data_type == Constant.INVALID: + print("[ERROR] The current folder contains both DB and other files. Please check.") + return + if self.analysis_mode not in COMM_FEATURE_LIST: + if data_type != Constant.DB: + print("[ERROR] The current analysis node only supports DB as input data. 
Please check.") + return + FileManager.create_output_dir(self.collection_path, is_overwrite=True) + params = { + Constant.COLLECTION_PATH: self.collection_path, + Constant.DATA_MAP: data_map, + Constant.DATA_TYPE: data_type, + Constant.RECIPE_NAME: self.origin_params.get(Constant.RECIPE_NAME, ""), + Constant.RECIPE_CLASS: self.origin_params.get(Constant.RECIPE_CLASS), + Constant.PARALLEL_MODE: self.origin_params.get(Constant.PARALLEL_MODE, ""), + Constant.EXPORT_TYPE: self.origin_params.get(Constant.EXPORT_TYPE, "") + } + params.update(params[Constant.RECIPE_CLASS].get_extra_argument(self.origin_params)) + AnalysisFacade(params).recipe_analyze() + else: + FileManager.create_output_dir(self.collection_path) + params = { + Constant.COLLECTION_PATH: self.collection_path, + Constant.DATA_MAP: data_map, + Constant.ANALYSIS_MODE: self.analysis_mode, + Constant.DATA_TYPE: data_type + } + comm_data_dict = CommunicationGroupGenerator(params).generate() + params[Constant.COMM_DATA_DICT] = comm_data_dict + AnalysisFacade(params).cluster_analyze() + + +def cluster_analysis_main(args=None): + parser = argparse.ArgumentParser(description="cluster analysis module") + parser.add_argument('-d', '--collection_path', type=str, required=True, help="profiling data path") + parser.add_argument('-m', '--mode', choices=ALL_FEATURE_LIST, + default='all', help="different analysis mode") + args_parsed, args_remained = parser.parse_known_args(args=args) + parameter = { + Constant.COLLECTION_PATH: args_parsed.collection_path, + Constant.ANALYSIS_MODE: args_parsed.mode + } + if args_parsed.mode in COMM_FEATURE_LIST: + if args_remained: + print(f"[ERROR] The specific argument {args_remained} is not supported for communication analysis.") + return + else: + parameter.update(parse_specific_params(args_parsed.mode, args_remained)) + Interface(parameter).run() + + +if __name__ == "__main__": + cluster_analysis_main() diff --git a/profiler/cluster_analyse_review/cluster_data_preprocess/__init__.py 
b/profiler/cluster_analyse_review/cluster_data_preprocess/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_data_preprocess/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/cluster_data_preprocess/data_preprocessor.py b/profiler/cluster_analyse_review/cluster_data_preprocess/data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..72d65ae6571e68564e46f43463843d1f46a3a69e --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_data_preprocess/data_preprocessor.py @@ -0,0 +1,41 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +from abc import abstractmethod + + +class DataPreprocessor: + PROFILER_INFO_HEAD = 'profiler_info_' + PROFILER_INFO_EXTENSION = '.json' + + def __init__(self, path_list: list): + self.path_list = path_list + self.data_map = {} + + @abstractmethod + def get_data_map(self): + pass + + def get_rank_id(self, dir_name: str) -> int: + files = os.listdir(dir_name) + for file_name in files: + if file_name.startswith(self.PROFILER_INFO_HEAD) and file_name.endswith(self.PROFILER_INFO_EXTENSION): + rank_id_str = file_name[len(self.PROFILER_INFO_HEAD): -1 * len(self.PROFILER_INFO_EXTENSION)] + try: + rank_id = int(rank_id_str) + except ValueError: + rank_id = -1 + return rank_id + return -1 diff --git a/profiler/cluster_analyse_review/cluster_data_preprocess/mindspore_data_preprocessor.py b/profiler/cluster_analyse_review/cluster_data_preprocess/mindspore_data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..a3e09983ddb54b972a9e343c1661b5c8b2cbb8c8 --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_data_preprocess/mindspore_data_preprocessor.py @@ -0,0 +1,41 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from collections import defaultdict + +from cluster_data_preprocess.data_preprocessor import DataPreprocessor + + +class MindsporeDataPreprocessor(DataPreprocessor): + + def __init__(self, path_list: list): + super().__init__(path_list) + + def get_data_map(self) -> dict: + rank_id_map = defaultdict(list) + for dir_name in self.path_list: + rank_id = self.get_rank_id(dir_name) + if rank_id < 0: + print('[Error]fail to get rankid or rankid invalid.') + continue + rank_id_map[rank_id].append(dir_name) + + try: + for (rank_id, dir_list) in rank_id_map.items(): + dir_list.sort(key=lambda x: x.split('_')[-3]) + self.data_map[rank_id] = dir_list[0] + except Exception as e: + raise RuntimeError("Found invalid directory name!") from e + return self.data_map diff --git a/profiler/cluster_analyse_review/cluster_data_preprocess/pytorch_data_preprocessor.py b/profiler/cluster_analyse_review/cluster_data_preprocess/pytorch_data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..55c3d03958b97c427fe8fde0625e72ea4dee8997 --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_data_preprocess/pytorch_data_preprocessor.py @@ -0,0 +1,56 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import glob +from collections import defaultdict +import os + +from cluster_data_preprocess.data_preprocessor import DataPreprocessor +from common_func.constant import Constant +from common_func.file_manager import FileManager + + +class PytorchDataPreprocessor(DataPreprocessor): + + def __init__(self, path_list: list): + super().__init__(path_list) + self.data_type = set() + + def get_data_map(self) -> dict: + rank_id_map = defaultdict(list) + for dir_name in self.path_list: + rank_id = self.get_rank_id(dir_name) + if rank_id < 0: + print('[Error]fail to get rankid or rankid invalid.') + continue + for file_name in os.listdir(dir_name): + if file_name.startswith(self.PROFILER_INFO_HEAD) and file_name.endswith(self.PROFILER_INFO_EXTENSION): + file_path = os.path.join(dir_name, file_name) + config = FileManager.read_json_file(file_path) + self.data_type.add(config.get(Constant.CONFIG, {}).get(Constant.EXPER_CONFIG, {}). + get(Constant.EXPORT_TYPE, Constant.TEXT)) + rank_id_map[rank_id].append(dir_name) + + try: + for (rank_id, dir_list) in rank_id_map.items(): + dir_list.sort(key=lambda x: x.split('_')[-3]) + self.data_map[rank_id] = dir_list[0] + except Exception as e: + raise RuntimeError("Found invalid directory name!") from e + return self.data_map + + def get_data_type(self): + if len(self.data_type) == 1: + return self.data_type.pop() + return Constant.INVALID diff --git a/profiler/cluster_analyse_review/cluster_kernels_analysis/README.md b/profiler/cluster_analyse_review/cluster_kernels_analysis/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f90f99fb9b3058d5ad67728b45da1c07f03e65e5 --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_kernels_analysis/README.md @@ -0,0 +1,67 @@ +# 功能介绍 +集群场景下,多卡间的算子情况,只能通过查看每张卡各自的性能数据来了解,不能直观的对比各卡之间算子的性能差异。 +cluster_op_summary_analysis.py脚本基于多卡性能数据的op_summary信息,统计并展示各卡中执行最快、最慢、均值和方差的TopN算子。 + +## 交附件 +### cluster_op_time_ analysis.csv 
+将算子以op_name、input_shape、input_size、output_shape进行分类,统计每一类算子,在不同节点(node)的不同卡(device)上,执行时间的最大、最小、方差、平均时间以及范围。 +### xxx_info.html + +主要是各个特性(time和ratio)的html文件,以html方式展示top_n算子的箱线图。 + +time和ratio表示AI Core和AI Vector Core算子性能指标中的耗时和占比字段。 + +以html文件展示TopN算子执行耗时和占比的箱线图。 + +有TopN个算子就会有TopN个坐标系,每个坐标系表示一个算子的特性,以total_time的平均值从左向右依次向下排序。 + +- 横坐标:node_device表示第几个node的第几张卡,从小到大排序。 +- 纵坐标:时间。 +- 坐标名:在坐标下方,以op_name-input_shape拼接展示。 + +# 操作指导 + +1. 准备性能数据 + + 拷贝所有node上的性能数据到一个环境里,性能数据必须包含在node*目录下,例如当前集群场景为2机16卡,那么就是两个node分别有八个device,拷贝性能数据目录如下: + + ```bash + ├── node0 # 可以是node0或node0_xxx,表示某个节点 + │ ├── PROF_XXXXX # 单个device的性能数据,须完成msprof性能数据解析 + │ ├── SUMMARY + │ ├── op_summary_XX.csv + | ...... # 一共八张卡的性能数据 + ├── node1 # 可以是node1 或者node1_xxx表示某个节点 + │ ├── PROF_XXXXX # 单个device的profiling数据 + │ ├── SUMMARY + │ ├── op_summary_XX.csv # 用来做解析的op_summary表格 + | ...... + ``` + +2. 拷贝脚本准备环境 + + 将cluster_prof_Info_analysis.py脚本拷贝到一个文件夹里,并安装对应的Python库。 + + ```bash + pip install pandas + pip install plotly + ``` + +3. 
运行脚本 + + ```bash + python3 cluster_prof_Info_analysis.py -d data_path -t type -n top_n + ``` + + - -d:集群场景性能数据目录,输入node的上一级目录。 + - -t:获取分析信息结果文件类型,可取值:html、csv、all,默认html。 + - -n:html分析独有,表示需要展示的是平均时间top_n的算子,默认10,配置超过30时需要一定时间。 + +异常情况处理: + +- -n参数必须大于0,如果输入<=0, 默认只导出一个算子的数据。 +- 配置-n参数值大于算子总数时,按等于算子数处理。 +- 部分没有op_summary的,不显示也不报错。 +- 目录下不存在op_summary时,执行报错无法找到数据文件。 +- op_summary列数据错误或读不到数据时,提示具体出错文件。 +- -t参数配置错误时,提示输入错误,并提示正确的配置。 diff --git a/profiler/cluster_analyse_review/cluster_kernels_analysis/__init__.py b/profiler/cluster_analyse_review/cluster_kernels_analysis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/cluster_analyse_review/cluster_kernels_analysis/cluster_prof_Info_analysis.py b/profiler/cluster_analyse_review/cluster_kernels_analysis/cluster_prof_Info_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..27e3c229c56d7c2a1afe6ae49d98c96b19bc55ff --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_kernels_analysis/cluster_prof_Info_analysis.py @@ -0,0 +1,327 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import sys
import argparse
import re
import os
import stat
import shutil
import warnings
from pathlib import Path

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import plot

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from common_func.path_manager import PathManager


# Upper bound for csv files we are willing to read (64 MB).
MAX_READ_FILE_BYTES = 64 * 1024 * 1024


class FormDataProcessor:
    """Finds every csv matching *form_name* under *path* and merges them.

    Each csv row is tagged with the device_id / node_id parsed from the file
    path so later group-bys can distinguish ranks.
    """

    def __init__(self, path, form_name):
        self.form_name = form_name
        self.files = self.get_files_with_prefix_recursive(path, form_name)

    def get_files_with_prefix_recursive(self, csv_path, match_str):
        """Return all files under *csv_path* matching the glob *match_str*.

        Raises:
            RuntimeError: when no file matches.
        """
        matched_ir_files = list(Path(csv_path).rglob(match_str))
        if not matched_ir_files:
            msg = f"Didn't find any file in folder {csv_path} that matches {match_str}"
            raise RuntimeError(msg)
        return [str(item) for item in matched_ir_files]

    def readSummaryData(self, columns_to_keep):
        """Read every csv, keep *columns_to_keep* plus device/node ids, and merge.

        Files with missing columns or non-conforming paths are skipped with a
        printed message; an empty DataFrame is returned when nothing is usable.
        """
        # Collect frames and concat once at the end: pd.concat inside the loop
        # re-copies the accumulated data and is quadratic in file count.
        frames = []
        for f in self.files:
            if "mindstudio_profiler_output" in f:
                continue
            # Validate csv size/permissions before reading.
            PathManager.check_path_readable(f)
            df = pd.read_csv(f)
            # Keep only the columns the analyzers need.
            try:
                df = df[columns_to_keep]
            except KeyError:
                print(f"{f}文件没有所需的列,请确认profiling数据的正确性:\n,以下列可能不存在{columns_to_keep}\n")
                continue
            # Derive the device id from the file path.
            try:
                df['device_id'] = self.getDeviceId(f)
            except Exception:
                print(f"文件 \"{f}\" 的路径或者是文件夹名没有按照要求,请确保存在[device_]这一级文件夹,具体操作指导见readme\n")
                continue
            # Derive the node id from the file path.
            try:
                df['node_id'] = self.getNodeId(f)
            except Exception:
                print(f"文件 \"{f}\" 的路径或者是文件夹名没有按照要求,请确保存在[node*]这一级文件夹,具体操作指导见readme\n")
                continue
            frames.append(df)
        return pd.concat(frames) if frames else pd.DataFrame()

    def getChipType(self):
        """Infer the chip generation from the columns of the first csv."""
        file = self.files[0]
        df = pd.read_csv(file)
        if 'aiv_time(us)' in df.columns:
            return "ASCEND_NEW"
        return "ASCEND_OTHER"

    def getDeviceId(self, dir_path):
        """Extract the id from the ``device_<n>`` path component (raises when absent)."""
        device_id = re.search(r'device_(\d+)', dir_path).group(1)
        return device_id

    def getNodeId(self, dir_path):
        """Extract the id from the ``node<n>`` path component (raises when absent)."""
        node_id = re.search(r'node(\d+)', dir_path).group(1)
        return int(node_id)

    def getRankNum(self):
        # NOTE(review): assumes exactly one matched csv per rank — confirm.
        return len(self.files)


# Table-driven lookup: which columns each deliverable needs per chip type.
class ViewInfoManager:
    def __init__(self, chip_type):
        self.chip_type = chip_type
        self.op_summary_columns_dict = {}
        self.setOpSummaryColumnsParams()

    def setOpSummaryColumnsParams(self):
        # Besides the csv columns used for grouping, some analyzers also group
        # by derived attributes; those live under 'extend_attr_to_group'.
        self.op_summary_columns_dict = {
            'ASCEND_NEW': {
                'TimeToCsvAnalyzer':
                    {'columns_to_group': ["Op Name", "Input Shapes", "Input Data Types", "Output Shapes"],
                     'extend_attr_to_group': ["device_id", "node_id"],
                     'columns_to_view': ["Task Duration(us)"],
                     'calculate_fun': ['mean', 'var', 'max', 'min']
                     },
                'StatisticalInfoToHtmlAnalyzer':
                    {'columns_to_group': ["Op Name", "Input Shapes", "Input Data Types", "Output Shapes"],
                     "columns_to_view": ["Task Duration(us)", "aiv_time(us)", "aiv_vec_ratio",
                                         "aiv_scalar_ratio", "aiv_mte2_ratio", "aiv_mte3_ratio",
                                         "aicore_time(us)", "aic_mac_ratio", "aic_scalar_ratio",
                                         "aic_mte1_ratio", "aic_mte2_ratio", "aic_fixpipe_ratio"
                                         ],
                     'calculate_fun': ['mean', 'var', 'max', 'min']
                     }
            },
            'ASCEND_OTHER': {
                'TimeToCsvAnalyzer':
                    {'columns_to_group': ["Op Name", "Input Shapes", "Input Data Types", "Output Shapes"],
                     'extend_attr_to_group': ["device_id", "node_id"],
                     "columns_to_view": ["Task Duration(us)"],
                     'calculate_fun': ['mean', 'var', 'max', 'min']
                     },
                'StatisticalInfoToHtmlAnalyzer':
                    {'columns_to_group': ["Op Name", "Input Shapes", "Input Data Types", "Output Shapes"],
                     "columns_to_view": ["aicore_time(us)", "Task Duration(us)", "mac_ratio", "vec_ratio",
                                         "scalar_ratio", "mte1_ratio", "mte2_ratio", "mte3_ratio"],
                     'calculate_fun': ['mean', 'var', 'max', 'min']
                     }
            }
        }

    def getColumnsInfo(self, analyzer_type):
        """Return the column config for *analyzer_type*, or None if unknown."""
        return self.op_summary_columns_dict.get(self.chip_type, {}).get(analyzer_type)


class OpSummaryAnalyzerBase:
    """Shared setup for deliverable analyzers: column config and result dir."""

    def __init__(self, chip_type, analyzer_type, dir_path):
        self.chip_type = chip_type
        view_info = ViewInfoManager(chip_type).getColumnsInfo(analyzer_type)
        # Fail with an explicit message instead of a TypeError on None below.
        if not view_info:
            raise RuntimeError(f"No view info configured for chip type {chip_type} and analyzer {analyzer_type}")
        self.columns_to_view = view_info['columns_to_view']
        self.calculate_fun = view_info['calculate_fun']
        self.columns_to_group = view_info['columns_to_group']
        self.attrs_to_group = self.columns_to_group.copy()
        if 'extend_attr_to_group' in view_info:
            extend_attr_to_group = view_info['extend_attr_to_group']
            self.attrs_to_group.extend(extend_attr_to_group)
        # Create a fresh result directory.
        self.result_dir = os.path.join(dir_path, "result")
        PathManager.check_path_length(self.result_dir)
        if os.path.exists(self.result_dir):
            shutil.rmtree(self.result_dir, onerror=self.on_rm_error)
        PathManager.check_path_writeable(dir_path)
        PathManager.make_dir_safety(self.result_dir)

    def getColumnsToGroup(self):
        return self.columns_to_group

    def getColumnsToView(self):
        return self.columns_to_view

    def calculateViewData(self, summary_data):
        """Group by the configured attributes and aggregate each view column."""
        calculate_dict = {column: self.calculate_fun for column in self.columns_to_view}
        view_data = summary_data.groupby(self.attrs_to_group).agg(calculate_dict).reset_index()
        return view_data

    def on_rm_error(self, func, path, exc_info):
        # path is the file rmtree could not remove; assume it is read-only,
        # make it writable and unlink it.
        os.chmod(path, stat.S_IWRITE)
        os.unlink(path)


class TimeToCsvAnalyzer(OpSummaryAnalyzerBase):
    """Writes per-op duration statistics (mean/var/max/min/range) to a csv."""

    def __init__(self, chip_type, dir_path):
        super().__init__(chip_type, "TimeToCsvAnalyzer", dir_path)

    def GenerateDeliverable(self, summary_data, rank_num):
        view_data = self.calculateViewData(summary_data)
        # Flatten the (column, agg) MultiIndex into "col_agg" names.
        view_data.columns = [''.join(col) if col[1] == "" else '_'.join(col) for col in view_data.columns]
        try:
            for column in self.columns_to_view:
                view_data[column + '_range'] = view_data[column + '_max'] - view_data[column + '_min']
        except Exception as e:
            raise RuntimeError("Invalid view data!") from e
        save_path = os.path.join(self.result_dir, "cluster_duration_time_analysis.csv")
        PathManager.check_path_length(save_path)
        view_data.to_csv(save_path, index=False)
        # The output file is made read-only; it must not be modified.
        os.chmod(save_path, stat.S_IROTH)
        return view_data


class StatisticalInfoToHtmlAnalyzer(OpSummaryAnalyzerBase):
    """Draws box plots of the top-N highest-variance ops into html files."""

    def __init__(self, chip_type, top_n, dir_path):
        super().__init__(chip_type, "StatisticalInfoToHtmlAnalyzer", dir_path)
        self.top_n = top_n
        # top_n is clamped into a valid range in GenerateDeliverable.

    def GenerateDeliverable(self, summary_data, rank_num):
        view_data = self.calculateViewData(summary_data)
        # Flatten only pure-name columns; keep ("col", agg) tuples intact so
        # the sort below can address ("Task Duration(us)", "var").
        view_data.columns = [''.join(col) if col[1] == "" else col for col in view_data.columns]

        # Clamp top_n to [1, number of grouped ops].
        self.top_n = min(max(self.top_n, 1), len(view_data))
        top_n_data = view_data.sort_values(("Task Duration(us)", 'var'), ascending=False).head(self.top_n)

        for column in self.columns_to_view:
            # One figure per viewed metric.
            self.drawPloty(column, summary_data, top_n_data, rank_num)

    def drawPloty(self, column, summary_data, top_n_data, rank_num):
        col_num = self.getCalNum(rank_num)
        row_num = self.top_n // col_num if self.top_n % col_num == 0 else (self.top_n + 1) // col_num
        fig = make_subplots(rows=row_num, cols=col_num, vertical_spacing=0.03)
        for i, (_, operation) in enumerate(top_n_data.iterrows()):
            # NOTE(review): the grouping also used "Output Shapes" but the
            # filter below does not — ops differing only in output shape are
            # merged into one subplot; confirm that is intended.
            op_data = summary_data[(summary_data["Op Name"] == operation["Op Name"]) &
                                   (summary_data["Input Shapes"] == operation["Input Shapes"]) &
                                   (summary_data["Input Data Types"] == operation["Input Data Types"])]
            op_data = op_data.sort_values(by=["node_id", "device_id"])
            node_ids = op_data['node_id'].unique()
            device_ids = op_data['device_id'].unique()

            for node_id in node_ids:
                for device_id in device_ids:
                    draw_data = op_data[(op_data['node_id'] == node_id) & (op_data['device_id'] == device_id)]
                    fig.add_trace(go.Box(y=draw_data[column],
                                         name=f'{node_id}_{device_id}',
                                         marker_color='green', showlegend=False),
                                  (i // col_num) + 1, (i % col_num) + 1)

            fig.update_xaxes(title_text=f'{operation["Op Name"]}-{operation["Input Shapes"]}',
                             row=(i // col_num) + 1,
                             col=(i % col_num) + 1)
        fig.update_layout(margin=dict(l=20, r=20, t=20, b=20),
                          height=int(500 * row_num),
                          width=int(rank_num * 100 * col_num),
                          title_text="Op Performance Comparison")
        save_plot_path = os.path.join(self.result_dir, column + "_Info.html")
        PathManager.check_path_length(save_plot_path)
        plot(fig, filename=save_plot_path)
        # The output file is made read-only; it must not be modified.
        os.chmod(save_plot_path, stat.S_IROTH)

    def getCalNum(self, rank_num):
        # How many subplots fit per row: two for small clusters, one otherwise.
        if rank_num <= 16:
            return 2
        else:
            return 1


class DeliverableGenerator:
    """Wires the csv reader to the requested analyzers and runs them."""

    def __init__(self, params):
        self.dirs = params.get('dir')
        self.formProcess = FormDataProcessor(self.dirs, 'op_summary*.csv')
        self.analyzers = []
        self.columns_to_keep = []
        self.setAnalyzers(params)
        self.setColumnsToKeep()

    def run(self):
        summary_data = self.formProcess.readSummaryData(self.columns_to_keep)
        # An empty frame means every csv failed to load.
        if summary_data.empty:
            print("没有符合要求的csv表格数据,请排查您的PROFILING数据")
            return
        rank_num = self.formProcess.getRankNum()
        for analyzer in self.analyzers:
            analyzer.GenerateDeliverable(summary_data, rank_num)

    def setAnalyzers(self, params):
        chip_type = self.formProcess.getChipType()
        # Refuse symlinked input paths; resolve to an absolute path otherwise.
        if os.path.islink(params.get('dir')):
            print(f"The file: \"{params.get('dir')}\" is link. Please check the path.")
            return
        prof_path = os.path.realpath(params.get('dir'))
        PathManager.input_path_common_check(prof_path)
        if params.get('type') == "all":
            self.analyzers = [TimeToCsvAnalyzer(chip_type, prof_path),
                              StatisticalInfoToHtmlAnalyzer(chip_type, params.get("top_n"), prof_path)]
        elif params.get('type') == "html":
            self.analyzers = [StatisticalInfoToHtmlAnalyzer(chip_type, params.get("top_n"), prof_path)]
        elif params.get('type') == "csv":
            self.analyzers = [TimeToCsvAnalyzer(chip_type, prof_path)]
        else:
            warnings.warn("参数错误,请输入 all html csv 这三种类型")  # warn on unknown type

    def setColumnsToKeep(self):
        columns_to_keep = []
        for analyzer in self.analyzers:
            columns_to_keep.extend(analyzer.getColumnsToGroup())
            columns_to_keep.extend(analyzer.getColumnsToView())
        self.columns_to_keep = list(set(columns_to_keep))


def main():
    # Parse command line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument("--dir", "-d", default=None, help="root dir of PROF_* data")
    parser.add_argument("--top_n", "-n", default=10, help="how many operators to show", type=int)
    parser.add_argument("--type", "-t", default='html', help="compare ratio or aicore-time", type=str)
    args = parser.parse_args()
    # Fail fast with a clean CLI error instead of a TypeError deep inside.
    if not args.dir:
        parser.error("--dir/-d is required")
    params = {
        "dir": args.dir,
        "top_n": args.top_n,
        "type": args.type
    }

    deviverable_gen = DeliverableGenerator(params)
    deviverable_gen.run()


if __name__ == "__main__":
    main()
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/cluster_statistics_export/cann_api_sum_export.py b/profiler/cluster_analyse_review/cluster_statistics_export/cann_api_sum_export.py new file mode 100644 index 0000000000000000000000000000000000000000..578ee937be57ff8615085bbe1e4ac6ccae81a4e9 --- /dev/null +++ b/profiler/cluster_analyse_review/cluster_statistics_export/cann_api_sum_export.py @@ -0,0 +1,65 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

from cluster_statistics_export.stats_export import StatsExport

# Per-API duration statistics aggregated from the CANN_API table, joined with
# STRING_IDS to resolve the api name and sorted by duration ratio descending.
# NOTE(review): median/stdev/lower_quartile/upper_quartile are not built-in
# SQLite aggregates — presumably registered by the db layer; confirm.
QUERY = """
WITH
    summary as (
        SELECT
            name,
            sum(endNs - startNs) AS duration,
            count (*) AS num,
            avg(endNs - startNs) AS avg_duration,
            min(endNs - startNs) AS min_duration,
            median(endNs - startNs) AS med_duration,
            max(endNs - startNs) AS max_duration,
            stdev(endNs - startNs) AS stdev_duration,
            lower_quartile(endNs - startNs) AS lower_quartile_duration,
            upper_quartile(endNs - startNs) AS upper_quartile_duration
        FROM
            CANN_API
        GROUP BY name
    ),
    totals AS (
        SELECT sum(duration) AS total
        FROM summary
    )
SELECT
    ids.value AS "name",
    round(summary.duration * 100.0 / (SELECT total FROM totals), 2) AS "durationRatio",
    summary.duration AS "totalTimeNs",
    summary.num AS "totalCount",
    round(summary.avg_duration, 1) AS "averageNs",
    round(summary.min_duration, 1) AS "minNs",
    round(summary.lower_quartile_duration, 1) AS "Q1Ns",
    round(summary.med_duration, 1) AS "medNs",
    round(summary.upper_quartile_duration, 1) AS "Q3Ns",
    round(summary.max_duration, 1) AS "maxNs",
    round(summary.stdev_duration, 1) AS "stdev"
FROM
    summary
LEFT JOIN
    STRING_IDS AS ids
    ON ids.id == summary.name
ORDER BY 2 DESC;
    """


class CannApiSumExport(StatsExport):
    """Statistics export for CANN API calls; the base class runs self._query."""

    def __init__(self, db_path, recipe_name):
        super().__init__(db_path, recipe_name)
        self._query = QUERY
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cluster_statistics_export.stats_export import StatsExport


# One row per compute task: op name/type, task type, input shapes and the
# task duration in ns, with STRING_IDS joins resolving interned strings.
QUERY = """
SELECT
    NAME_IDS.value AS "OpName",
    OPTYPE_IDS.value AS "OpType",
    TASKTYPE_IDS.value AS "TaskType",
    INPUTSHAPES_IDS.value AS "InputShapes",
    round(TASK.endNs - TASK.startNs) AS "Duration"
FROM
    COMPUTE_TASK_INFO
LEFT JOIN TASK
    ON TASK.globalTaskId == COMPUTE_TASK_INFO.globalTaskId
LEFT JOIN
    STRING_IDS AS NAME_IDS
    ON NAME_IDS.id == COMPUTE_TASK_INFO.name
LEFT JOIN
    STRING_IDS AS OPTYPE_IDS
    ON OPTYPE_IDS.id == COMPUTE_TASK_INFO.opType
LEFT JOIN
    STRING_IDS AS TASKTYPE_IDS
    ON TASKTYPE_IDS.id == COMPUTE_TASK_INFO.taskType
LEFT JOIN
    STRING_IDS AS INPUTSHAPES_IDS
    ON INPUTSHAPES_IDS.id == COMPUTE_TASK_INFO.inputShapes
    """


class ComputeOpSumExport(StatsExport):
    """Export of per-task compute-op rows; the base class runs self._query."""

    def __init__(self, db_path, recipe_name):
        super().__init__(db_path, recipe_name)
        self._query = QUERY
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cluster_statistics_export.stats_export import StatsExport


# One row per communication op: resolved op name/type plus duration in ns.
QUERY = """
SELECT
    NAME_IDS.value AS "OpName",
    TYPE_IDS.value AS "OpType",
    round(endNs - startNs) AS "Duration"
FROM
    COMMUNICATION_OP
LEFT JOIN
    STRING_IDS AS TYPE_IDS
    ON TYPE_IDS.id == COMMUNICATION_OP.opType
LEFT JOIN
    STRING_IDS AS NAME_IDS
    ON NAME_IDS.id == COMMUNICATION_OP.opName
    """


class HcclSumExport(StatsExport):
    """Export of HCCL communication-op rows; the base class runs self._query."""

    def __init__(self, db_path, recipe_name):
        super().__init__(db_path, recipe_name)
        self._query = QUERY
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cluster_statistics_export.stats_export import StatsExport


# Correlates each MSTX mark event with its device task and framework API call
# via connectionId, yielding CANN / device / framework timestamps per message.
QUERY = """
WITH
    FRAMEWORK_API AS (
        SELECT
            PYTORCH_API.startNs,
            CONNECTION_IDS.connectionId
        FROM
            PYTORCH_API
        LEFT JOIN
            CONNECTION_IDS
            ON PYTORCH_API.connectionId == CONNECTION_IDS.id
    )
SELECT
    MSG_IDS.value AS "msg",
    MSTX_EVENTS.startNs AS "cann_ts",
    TASK.startNs AS "device_ts",
    FRAMEWORK_API.startNs AS "framework_ts",
    MSTX_EVENTS.globalTid AS "tid"
FROM
    MSTX_EVENTS
LEFT JOIN
    TASK
    ON MSTX_EVENTS.connectionId == TASK.connectionId
LEFT JOIN
    FRAMEWORK_API
    ON MSTX_EVENTS.connectionId == FRAMEWORK_API.connectionId
LEFT JOIN
    STRING_IDS AS MSG_IDS
    ON MSTX_EVENTS.message == MSG_IDS.id
ORDER BY
    MSTX_EVENTS.startNs
    """


class MstxMarkExport(StatsExport):
    """Export of MSTX mark events; the base class runs self._query."""

    def __init__(self, db_path, recipe_name):
        super().__init__(db_path, recipe_name)
        self._query = QUERY
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cluster_statistics_export.stats_export import StatsExport


# Step boundaries (id, start, end in ns) in chronological order.
QUERY = """
SELECT
    id AS "step_id",
    startNs AS "start_ns",
    endNs AS "end_ns"
FROM
    STEP_TIME
ORDER BY
    startNs
    """


class MstxStepExport(StatsExport):
    """Export of step time ranges; the base class runs self._query."""

    def __init__(self, db_path, recipe_name):
        super().__init__(db_path, recipe_name)
        self._query = QUERY

import pandas as pd

from common_func.db_manager import DBManager
from common_func.constant import Constant


class StatsExport:
    """Base class for sql exports: subclasses set self._query in __init__."""

    def __init__(self, db_path, analysis_class):
        self._db_path = db_path            # path to the analysis db file
        self._analysis_class = analysis_class  # recipe/analysis identifier
        self._query = None                 # subclasses assign their QUERY here

    def get_query(self):
        return self._query

    def read_export_db(self):
        """Run the subclass query against the analysis db; return a DataFrame.

        Returns None (after printing an error) when no query was configured.
        """
        query = self.get_query()
        if query is None:
            print(f"[ERROR] query is None.")
            return
        conn, cursor = DBManager.create_connect_db(self._db_path, Constant.ANALYSIS)
        data = pd.read_sql(query, conn)
        DBManager.destroy_db_connect(conn, cursor)
        return data


import copy

from common_func.constant import Constant
from common_func.table_constant import TableConstant


class DataTransferAdapter(object):
    """Converts communication/matrix data between the flat db-row form
    (TableConstant keys) and the nested json form (Constant keys).

    Each *_TABLE_COLUMN list is positionally paired with the matching
    *_JSON_COLUMN list via zip(); keep the orders in sync.
    """

    COMM_TIME_TABLE_COLUMN = [TableConstant.START_TIMESTAMP, TableConstant.ELAPSED_TIME, TableConstant.TRANSIT_TIME,
                              TableConstant.WAIT_TIME, TableConstant.SYNCHRONIZATION_TIME, TableConstant.IDLE_TIME,
                              TableConstant.SYNCHRONIZATION_TIME_RATIO, TableConstant.WAIT_TIME_RATIO]
    COMM_TIME_JSON_COLUMN = [Constant.START_TIMESTAMP, Constant.ELAPSE_TIME_MS, Constant.TRANSIT_TIME_MS,
                             Constant.WAIT_TIME_MS, Constant.SYNCHRONIZATION_TIME_MS, Constant.IDLE_TIME_MS,
                             Constant.SYNCHRONIZATION_TIME_RATIO, Constant.WAIT_TIME_RATIO]
    MATRIX_TABLE_COLUMN = [TableConstant.TRANSIT_SIZE, TableConstant.TRANSIT_TIME, TableConstant.BANDWIDTH,
                           TableConstant.TRANSPORT_TYPE, TableConstant.OPNAME]
    MATRIX_JSON_COLUMN = [Constant.TRANSIT_SIZE_MB, Constant.TRANSIT_TIME_MS, Constant.BANDWIDTH_GB_S,
                          Constant.TRANSPORT_TYPE, Constant.OP_NAME]
    COMM_BD_TABLE_COLUMN = [TableConstant.TRANSIT_SIZE, TableConstant.TRANSIT_TIME, TableConstant.BANDWIDTH,
                            TableConstant.LARGE_PACKET_RATIO]
    COMM_BD_JSON_COLUMN = [Constant.TRANSIT_SIZE_MB, Constant.TRANSIT_TIME_MS, Constant.BANDWIDTH_GB_S,
                          Constant.LARGE_PACKET_RATIO]

    def __init__(self):
        super().__init__()

    def transfer_comm_from_db_to_json(self, time_info: list, bandwidth_info: list):
        """Rebuild the nested communication json (step -> type -> "op@group")
        from flat time and bandwidth db rows."""
        result = {}
        if not time_info and not bandwidth_info:
            return result
        for time_data in time_info:
            comm_time = dict()
            # Ops are keyed "hccl_op_name@group_name" in the json form.
            hccl_name = time_data[TableConstant.HCCL_OP_NAME] + "@" + time_data[TableConstant.GROUP_NAME]
            for key, value in dict(zip(self.COMM_TIME_JSON_COLUMN, self.COMM_TIME_TABLE_COLUMN)).items():
                # NOTE(review): the Constant ratio keys end with "Ratio"
                # (capital R), so this lowercase check never matches and the
                # ratio fields ARE copied — confirm whether they were meant
                # to be skipped here.
                if not key.endswith("ratio"):
                    comm_time[key] = time_data.get(value, 0)
            result.setdefault(time_data[TableConstant.STEP], {}).setdefault(time_data[TableConstant.TYPE], {}). \
                setdefault(hccl_name, {})[Constant.COMMUNICATION_TIME_INFO] = comm_time
        hccl_set = set()
        for bd_data in bandwidth_info:
            hccl_name = bd_data[TableConstant.HCCL_OP_NAME] + "@" + bd_data[TableConstant.GROUP_NAME]
            hccl_set.add(hccl_name)
        for hccl in hccl_set:
            comm_bd = dict()
            # Gather every bandwidth row belonging to this op@group; rows for
            # the same transport type share one dict, with the per-packet-size
            # [count, total_duration] pairs nested under SIZE_DISTRIBUTION.
            for bd_data in bandwidth_info:
                if hccl == (bd_data[TableConstant.HCCL_OP_NAME] + "@" + bd_data[TableConstant.GROUP_NAME]):
                    temp_dict = dict()
                    key_dict = dict(zip(self.COMM_BD_JSON_COLUMN, self.COMM_BD_TABLE_COLUMN))
                    self.set_value_by_key(temp_dict, bd_data, key_dict)
                    comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], temp_dict).setdefault(
                        Constant.SIZE_DISTRIBUTION, {})[bd_data[TableConstant.PACKAGE_SIZE]] = \
                        [bd_data[TableConstant.COUNT], bd_data[TableConstant.TOTAL_DURATION]]
            result.setdefault(bd_data[TableConstant.STEP], {}).setdefault(bd_data[TableConstant.TYPE], {}). \
                setdefault(hccl, {})[Constant.COMMUNICATION_BANDWIDTH_INFO] = comm_bd
        return result

    def transfer_comm_from_json_to_db(self, res_data: dict):
        """Flatten the nested communication json back into db rows.

        Returns:
            (time_rows, bandwidth_rows): two lists of flat dicts.
        """
        res_comm_data, res_bd_data = list(), list()

        # The nested helpers read rank_set/step/op_name/op_data from the
        # enclosing loop scope below.
        def split_comm_time():
            for rank_id, comm_data in op_data.items():
                time_data = comm_data.get(Constant.COMMUNICATION_TIME_INFO)
                res_time = set_only_value(rank_id)
                for key, value in dict(zip(self.COMM_TIME_TABLE_COLUMN, self.COMM_TIME_JSON_COLUMN)).items():
                    res_time[key] = time_data.get(value, 0)
                res_comm_data.append(res_time)
                bd_data = comm_data.get(Constant.COMMUNICATION_BANDWIDTH_INFO, {})
                for transport_type, data in bd_data.items():
                    res_bandwidth = set_only_value(rank_id)
                    key_dict = dict(zip(self.COMM_BD_TABLE_COLUMN, self.COMM_BD_JSON_COLUMN))
                    res_bandwidth[TableConstant.TRANSPORT_TYPE] = transport_type
                    self.set_value_by_key(res_bandwidth, data, key_dict)
                    for key, value in data.get(Constant.SIZE_DISTRIBUTION, {}).items():
                        # One output row per packet size; deep-copy so later
                        # mutations do not alias earlier rows.
                        res_bandwidth[TableConstant.PACKAGE_SIZE] = key
                        res_bandwidth[TableConstant.COUNT] = value[0]
                        res_bandwidth[TableConstant.TOTAL_DURATION] = value[1]
                        temp_dict = copy.deepcopy(res_bandwidth)
                        res_bd_data.append(temp_dict)

        def set_only_value(rank_id):
            # Common identifying fields shared by time and bandwidth rows.
            res_dict = dict()
            res_dict[TableConstant.RANK_SET] = str(rank_set)
            res_dict[TableConstant.STEP] = step
            res_dict[TableConstant.RANK_ID] = rank_id
            res_dict[TableConstant.HCCL_OP_NAME] = op_name.split("@")[0] if "@" in op_name else op_name
            res_dict[TableConstant.GROUP_NAME] = op_name.split("@")[1] if "@" in op_name else ""
            return res_dict

        for rank_set, step_dict in res_data.items():
            for step, op_dict in step_dict.items():
                for op_name, op_data in op_dict.items():
                    split_comm_time()
        return res_comm_data, res_bd_data

    def set_value_by_key(self, src_dict, dst_dict, key_dict):
        """Copy dst_dict[value] into src_dict[key] for every mapping (0 default)."""
        for key, value in key_dict.items():
            src_dict[key] = dst_dict.get(value, 0)

    def transfer_matrix_from_db_to_json(self, matrix_data: list):
        """Rebuild the nested matrix json (step -> type -> "op@group" ->
        "src-dst") from flat db rows."""
        result = {}
        if not matrix_data:
            return result
        hccl_set = set()
        for data in matrix_data:
            hccl = data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME]
            hccl_set.add(hccl)
        for hccl in hccl_set:
            for data in matrix_data:
                if hccl == (data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME]):
                    # Links are keyed "src_rank-dst_rank".
                    key = data[TableConstant.SRC_RANK] + '-' + data[TableConstant.DST_RANK]
                    temp_dict = dict()
                    key_dict = dict(zip(self.MATRIX_JSON_COLUMN, self.MATRIX_TABLE_COLUMN))
                    self.set_value_by_key(temp_dict, data, key_dict)
                    result.setdefault(data[TableConstant.STEP], {}).setdefault(data[TableConstant.TYPE], {}). \
                        setdefault(hccl, {}).setdefault(key, temp_dict)
        return result

    def transfer_matrix_from_json_to_db(self, res_data: dict):
        """Flatten the nested matrix json back into a list of db rows."""
        result = list()

        # Reads rank_set/step/op_dict from the enclosing loop scope below.
        def split_matrix_data():
            for op_name, op_data in op_dict.items():
                for link_key, link_data in op_data.items():
                    if "@" in op_name:
                        hccl_op_name, group_name = op_name.split("@")[0], op_name.split("@")[1]
                    else:
                        hccl_op_name, group_name = op_name, ""
                    matrix_data = {
                        TableConstant.RANK_SET: str(rank_set),
                        TableConstant.STEP: step,
                        TableConstant.HCCL_OP_NAME: hccl_op_name,
                        TableConstant.GROUP_NAME: group_name,
                        TableConstant.SRC_RANK: link_key.split("-")[0],
                        TableConstant.DST_RANK: link_key.split("-")[1]
                    }
                    key_dict = dict(zip(self.MATRIX_TABLE_COLUMN, self.MATRIX_JSON_COLUMN))
                    self.set_value_by_key(matrix_data, link_data, key_dict)
                    result.append(matrix_data)

        for rank_set, step_dict in res_data.items():
            for step, op_dict in step_dict.items():
                split_matrix_data()
        return result
# Copyright (c) 2024, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import inspect
import sys

from common_func.constant import Constant
from analysis.base_analysis import BaseRecipeAnalysis


def is_analysis_class(obj):
    """True for proper subclasses of BaseRecipeAnalysis (the base is excluded)."""
    return inspect.isclass(obj) and issubclass(obj, BaseRecipeAnalysis) and obj != BaseRecipeAnalysis


def get_class_from_name(analysis_name: str):
    """Import ``analysis.<name>.<name>`` and return its first recipe member.

    Returns:
        The ``(class_name, class)`` tuple of the first BaseRecipeAnalysis
        subclass found in the module, or None when the module cannot be
        imported or contains no such class (previously this crashed with an
        IndexError after printing the error message).
    """
    sys.path.append(Constant.ANALYSIS_PATH)
    analysis_path = f"analysis.{analysis_name}.{analysis_name}"
    try:
        module = importlib.import_module(analysis_path)
    except Exception as e:
        # Early return: inspecting a missing module is pointless.
        print(f"[ERROR] {analysis_path} not find:{e}")
        return None

    specific_analysis = inspect.getmembers(module, is_analysis_class)
    if not specific_analysis:
        # Guard the empty case instead of raising IndexError below.
        print(f"[ERROR] {analysis_name} not found.")
        return None
    return specific_analysis[0]
# Copyright (c) 2023, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

class Constant(object):
    """Shared string/number constants for the cluster analysis tooling."""

    # dir name
    FRAMEWORK_DIR = "FRAMEWORK"
    CLUSTER_ANALYSIS_OUTPUT = "cluster_analysis_output"
    SINGLE_OUTPUT = "ASCEND_PROFILER_OUTPUT"
    COMM_JSON = "communication.json"
    COMM_MATRIX_JSON = "communication_matrix.json"
    STEP_TIME_CSV = "step_trace_time.csv"
    KERNEL_DETAILS_CSV = "kernel_details.csv"

    # file authority and size limits
    FILE_AUTHORITY = 0o640
    DIR_AUTHORITY = 0o750
    MAX_JSON_SIZE = 1024 * 1024 * 1024 * 10
    MAX_CSV_SIZE = 1024 * 1024 * 1024 * 5
    MAX_PATH_LENGTH = 4096
    MAX_READ_DB_FILE_BYTES = 1024 * 1024 * 1024 * 8

    # communication json keys
    P2P = "p2p"
    COLLECTIVE = "collective"
    STEP_ID = "step_id"
    RANK_ID = "rank_id"
    GROUP_NAME = "group_name"
    COMM_OP_TYPE = "comm_op_type"
    COMM_OP_NAME = "comm_op_name"
    COMM_OP_INFO = "comm_op_info"
    TOTAL_OP_INFO = "Total Op Info"
    COMMUNICATION_TIME_INFO = "Communication Time Info"
    START_TIMESTAMP = "Start Timestamp(us)"
    COMMUNICATION_BANDWIDTH_INFO = "Communication Bandwidth Info"
    HCOM_SEND = "hcom_send"
    HCOM_RECEIVE = "hcom_receive"
    SYNCHRONIZATION_TIME_RATIO = "Synchronization Time Ratio"
    SYNCHRONIZATION_TIME_MS = "Synchronization Time(ms)"
    WAIT_TIME_RATIO = "Wait Time Ratio"
    TRANSIT_TIME_MS = "Transit Time(ms)"
    TRANSIT_SIZE_MB = "Transit Size(MB)"
    SIZE_DISTRIBUTION = "Size Distribution"
    WAIT_TIME_MS = "Wait Time(ms)"
    OP_NAME = "Op Name"
    BANDWIDTH_GB_S = "Bandwidth(GB/s)"
    COMMUNICATION = "communication.json"
    ELAPSE_TIME_MS = "Elapse Time(ms)"
    IDLE_TIME_MS = "Idle Time(ms)"
    LARGE_PACKET_RATIO = "Large Packet Ratio"

    # params dict keys
    DATA_MAP = "data_map"
    COLLECTIVE_GROUP = "collective_group"
    COMMUNICATION_OPS = "communication_ops"
    MATRIX_OPS = "matrix_ops"
    COLLECTION_PATH = "collection_path"
    COMMUNICATION_GROUP = "communication_group"
    TRANSPORT_TYPE = "Transport Type"
    COMM_DATA_DICT = "comm_data_dict"
    DATA_TYPE = "data_type"
    ANALYSIS_MODE = "analysis_mode"

    # step time
    RANK = "rank"
    STAGE = "stage"

    # epsilon used to avoid division by zero
    EPS = 1e-15

    # file suffix
    JSON_SUFFIX = ".json"
    CSV_SUFFIX = ".csv"

    # result files type
    TEXT = "text"
    DB = "db"
    INVALID = "invalid"

    # db name
    DB_COMMUNICATION_ANALYZER = "analysis.db"
    DB_CLUSTER_COMMUNICATION_ANALYZER = "cluster_analysis.db"

    # db tables
    TABLE_COMM_ANALYZER_BANDWIDTH = "CommAnalyzerBandwidth"
    TABLE_COMM_ANALYZER_TIME = "CommAnalyzerTime"
    TABLE_COMM_ANALYZER_MATRIX = "CommAnalyzerMatrix"
    TABLE_STEP_TRACE = "StepTraceTime"
    TABLE_HOST_INFO = "HostInfo"
    TABLE_RANK_DEVICE_MAP = "RankDeviceMap"

    # data config key
    CONFIG = "config"
    EXPER_CONFIG = "experimental_config"
    EXPORT_TYPE = "_export_type"

    # recipe config
    ANALYSIS = "analysis"
    RECIPE_NAME = "recipe_name"
    RECIPE_CLASS = "recipe_class"
    PARALLEL_MODE = "parallel_mode"
    # NOTE(review): this resolves to <this_package>/analysis (a subdirectory
    # of the directory containing this file) — confirm the analysis package
    # actually lives there and not as a sibling package.
    CLUSTER_CUSTOM_ANALYSE_PATH = os.path.abspath(os.path.dirname(__file__))
    ANALYSIS_PATH = os.path.join(CLUSTER_CUSTOM_ANALYSE_PATH, 'analysis')

    CONCURRENT_MODE = "concurrent"
b/profiler/cluster_analyse_review/common_func/context.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from functools import partial +from concurrent import futures +from common_func.constant import Constant + + +class Context(object): + """abstract base class""" + + ctx_map = None + + @classmethod + def create_context(cls, mode=Constant.CONCURRENT_MODE): + if cls.ctx_map is None: + keys = [Constant.CONCURRENT_MODE] + values = [ConcurrentContext] + cls.ctx_map = dict(zip(keys, values)) + + if mode not in cls.ctx_map: + raise NotImplementedError("mode must be in {}".format(keys)) + + return cls.ctx_map[mode]() + + def __init__(self): + print("[INFO] context {} initialized.".format(self._mode)) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + if exc_type is not None: + print(f"[ERROR] Failed to exit context: {exc_val}") + + def launch(self, func, *args, **kwargs): + raise NotImplementedError + + def map(self, func, *iterables, **kwargs): + raise NotImplementedError + + def wait(self, waitable): + raise NotImplementedError + +class ConcurrentContext(Context): + + def __init__(self, executor=None): + self._mode = Constant.CONCURRENT_MODE + super().__init__() + self._custom = executor is None + self._executor = executor or futures.ProcessPoolExecutor(max_workers=os.cpu_count()) + + def __enter__(self): + if 
self._executor is None: + raise RuntimeError("executor is None") + return self + + def close(self): + if self._custom: + self._executor.shutdown(wait=True) + self._executor = None + + def launch(self, func, *args, **kwargs): + return self._executor.submit(func, *args, **kwargs).result() + + def map(self, func, *iterables, **kwargs): + partial_func = partial(func, **kwargs) + return list(self._executor.map(partial_func, *iterables)) + + def wait(self, waitable): + return waitable \ No newline at end of file diff --git a/profiler/cluster_analyse_review/common_func/db_manager.py b/profiler/cluster_analyse_review/common_func/db_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..c0d6ad89be8edd8bbb2a4ee8e0653141550b0129 --- /dev/null +++ b/profiler/cluster_analyse_review/common_func/db_manager.py @@ -0,0 +1,233 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sqlite3 + +from common_func.constant import Constant +from common_func.empty_class import EmptyClass +from common_func.file_manager import check_db_path_valid +from common_func.tables_config import TablesConfig +from common_func.sql_extention_func import SqlExtentionAggregateFunc + +class DBManager: + """ + class to manage DB operation + """ + FETCH_SIZE = 10000 + INSERT_SIZE = 10000 + MAX_ROW_COUNT = 100000000 + + @staticmethod + def create_connect_db(db_path: str, mode=None) -> tuple: + """ + create and connect database + """ + if check_db_path_valid(db_path, is_create=True): + try: + conn = sqlite3.connect(db_path) + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return EmptyClass("empty conn"), EmptyClass("empty curs") + try: + if mode == Constant.ANALYSIS: + try: + for func_name, params_count, class_name in SqlExtentionAggregateFunc: + conn.create_aggregate(func_name, params_count, class_name) + except sqlite3.Error as err: + print(f"[ERROR] {err}") + if isinstance(conn, sqlite3.Connection): + curs = conn.cursor() + os.chmod(db_path, Constant.FILE_AUTHORITY) + return conn, curs + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return EmptyClass("empty conn"), EmptyClass("empty curs") + return EmptyClass("empty conn"), EmptyClass("empty curs") + + @staticmethod + def destroy_db_connect(conn: any, curs: any) -> None: + """ + destroy db connection + """ + try: + if isinstance(curs, sqlite3.Cursor): + curs.close() + except sqlite3.Error as err: + print(f"[ERROR] {err}") + try: + if isinstance(conn, sqlite3.Connection): + conn.close() + except sqlite3.Error as err: + print(f"[ERROR] {err}") + + @staticmethod + def judge_table_exists(curs: any, table_name: str) -> any: + """ + judge table exists + """ + if not isinstance(curs, sqlite3.Cursor): + return False + try: + curs.execute("select count(*) from sqlite_master where type='table' and name=?", (table_name,)) + return curs.fetchone()[0] + except sqlite3.Error as err: + 
print("[ERROR] {}".format(err)) + return False + + @staticmethod + def sql_generate_table(table_map: str): + header_with_type_begin = "(" + header_with_type_end = ")" + header_with_type_list = [] + if table_map in TablesConfig.DATA: + items = TablesConfig.DATA[table_map] + for item in items: + if item[0] == "index": + header_with_type_list.append('"' + item[0] + '" ' + item[1].split(",")[0]) + else: + header_with_type_list.append(item[0] + ' ' + item[1].split(",")[0]) + header_with_type_begin += ",".join(header_with_type_list) + header_with_type_begin += header_with_type_end + return header_with_type_begin + return "" + + @classmethod + def check_tables_in_db(cls, db_path: any, *tables: any) -> bool: + if check_db_path_valid(db_path): + conn, curs = cls.create_connect_db(db_path) + if not (conn and curs): + return False + res = True + for table in tables: + if not cls.judge_table_exists(curs, table): + res = False + break + cls.destroy_db_connect(conn, curs) + return res + return False + + @classmethod + def create_tables(cls, db_path: any, *tables: any): + conn, curs = cls.create_connect_db(db_path) + if not (conn and curs): + return + for table_name in tables: + if cls.judge_table_exists(curs, table_name): + drop_sql = "drop table {0}".format(table_name) + cls.execute_sql(conn, drop_sql) + table_map = "{0}Map".format(table_name) + header_with_type = cls.sql_generate_table(table_map) + sql = "CREATE TABLE IF NOT EXISTS " + table_name + header_with_type + cls.execute_sql(conn, sql) + cls.destroy_db_connect(conn, curs) + + @classmethod + def get_table_column_count(cls, db_path: any, table: any) -> int: + conn, curs = cls.create_connect_db(db_path) + if not (conn and curs): + return 0 + sql = "SELECT COUNT(*) FROM pragma_table_info('{}')".format(table) + res = 0 + try: + curs.execute(sql) + res = curs.fetchone()[0] + except sqlite3.Error as err: + print("[ERROR] {}".format(err)) + finally: + cls.destroy_db_connect(conn, curs) + return res + + @staticmethod + def 
execute_sql(conn: any, sql: str, params: any = None) -> bool: + """ + execute sql + """ + try: + if isinstance(conn, sqlite3.Connection): + if params: + conn.cursor().execute(sql, params) + else: + conn.cursor().execute(sql) + conn.commit() + return True + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return False + print("[ERROR] conn is invalid param") + return False + + @staticmethod + def executemany_sql(conn: any, sql: str, params: any) -> bool: + """ + execute many sql once + """ + try: + if isinstance(conn, sqlite3.Connection): + conn.cursor().executemany(sql, params) + conn.commit() + return True + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return False + print("[ERROR] conn is invalid param") + return False + + @classmethod + def fetch_all_data(cls: any, curs: any, sql: str, param: tuple = None, is_dict: bool = True) -> list: + """ + fetch 10000 num of data from db each time to get all data + """ + if not isinstance(curs, sqlite3.Cursor): + return [] + data = [] + try: + if param: + res = curs.execute(sql, param) + else: + res = curs.execute(sql) + except sqlite3.Error as err: + print(f"[ERROR] {err}") + curs.row_factory = None + return [] + try: + description = res.description + while True: + res = curs.fetchmany(cls.FETCH_SIZE) + if is_dict: + data += CustomizedDictFactory.generate_dict_from_db(res, description) + else: + data += res + if len(data) > cls.MAX_ROW_COUNT: + print("[WARRING] The records count in the table exceeds the limit!") + if len(res) < cls.FETCH_SIZE: + break + return data + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return [] + finally: + curs.row_factory = None + + +class CustomizedDictFactory: + @staticmethod + def generate_dict_from_db(data_result: any, description: any) -> any: + description_set = [i[0] for i in description] + res = [] + for data in data_result: + data_dict = dict(zip(description_set, data)) + res.append(data_dict) + return res diff --git 
class EmptyClass:
    """Falsy stand-in object returned where a real value could not be produced."""

    def __init__(self: any, info: str = "") -> None:
        # Keep the descriptive text for later inspection via the `info` property.
        self._info = info

    @staticmethod
    def is_empty() -> bool:
        """An EmptyClass instance is always considered empty."""
        return True

    @property
    def info(self: any) -> str:
        """Descriptive text supplied at construction time."""
        return self._info

    @classmethod
    def __bool__(cls: any) -> bool:
        """Evaluate as False in boolean contexts, so `if not obj:` checks work."""
        return False

    @classmethod
    def __str__(cls: any) -> str:
        """Render as the empty string."""
        return ""
+ +import os +import csv +import json + +from common_func.constant import Constant +from common_func.path_manager import PathManager + + +class FileManager: + DATA_FILE_AUTHORITY = 0o640 + DATA_DIR_AUTHORITY = 0o750 + + @classmethod + def read_csv_file(cls, file_path: str, class_bean: any) -> list: + PathManager.check_path_readable(file_path) + base_name = os.path.basename(file_path) + file_size = os.path.getsize(file_path) + if file_size <= 0: + return [] + if file_size > Constant.MAX_CSV_SIZE: + raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.") + result_data = [] + try: + with open(file_path, newline="") as csv_file: + reader = csv.DictReader(csv_file) + for row in reader: + result_data.append(class_bean(row)) + except Exception as e: + raise RuntimeError(f"Failed to read the file: {base_name}") from e + return result_data + + @classmethod + def read_json_file(cls, file_path: str) -> dict: + PathManager.check_path_readable(file_path) + base_name = os.path.basename(file_path) + file_size = os.path.getsize(file_path) + if file_size <= 0: + return {} + if file_size > Constant.MAX_JSON_SIZE: + raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.") + try: + with open(file_path, "r") as json_file: + result_data = json.loads(json_file.read()) + except Exception as e: + raise RuntimeError(f"Failed to read the file: {base_name}") from e + return result_data + + @classmethod + def create_csv_file(cls, profiler_path: str, data: list, file_name: str, headers: list = None) -> None: + if not data: + return + output_path = os.path.join( + profiler_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + output_file = os.path.join(output_path, file_name) + base_name = os.path.basename(output_file) + PathManager.check_path_writeable(output_path) + try: + with os.fdopen( + os.open(output_file, os.O_WRONLY | os.O_CREAT, cls.DATA_FILE_AUTHORITY), + 'w', newline="" + ) as file: + writer = csv.writer(file) + if headers: + 
writer.writerow(headers) + writer.writerows(data) + except Exception as e: + raise RuntimeError(f"Can't create file: {base_name}") from e + + @classmethod + def create_json_file(cls, profiler_path: str, data: dict, file_name: str) -> None: + if not data: + return + output_path = os.path.join(profiler_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + output_file = os.path.join(output_path, file_name) + base_name = os.path.basename(output_file) + PathManager.check_path_writeable(output_path) + try: + with os.fdopen( + os.open(output_file, os.O_WRONLY | os.O_CREAT, cls.DATA_FILE_AUTHORITY), 'w' + ) as file: + file.write(json.dumps(data)) + except Exception as e: + raise RuntimeError(f"Can't create the file: {base_name}") from e + + @classmethod + def create_output_dir(cls, collection_path: str, is_overwrite: bool = False) -> None: + output_path = os.path.join( + collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + if is_overwrite: + if not os.path.exists(output_path): + PathManager.make_dir_safety(output_path) + return + PathManager.remove_path_safety(output_path) + PathManager.make_dir_safety(output_path) + + @classmethod + def check_file_size(cls, file_path): + suffix = os.path.splitext(file_path) + base_name = os.path.join(file_path) + if suffix == Constant.CSV_SUFFIX: + limit_size = Constant.MAX_CSV_SIZE + else: + limit_size = Constant.MAX_JSON_SIZE + file_size = os.path.getsize(file_path) + if file_size > limit_size: + raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.") + + +def check_db_path_valid(path: str, is_create: bool = False, max_size: int = Constant.MAX_READ_DB_FILE_BYTES) -> bool: + if os.path.islink(path): + print(f'[ERROR] The db file path: {path} is link. Please check the path') + return False + if not is_create and os.path.exists(path) and os.path.getsize(path) > max_size: + print(f'[ERROR] The db file: {path} is too large to read. 
Please check the file') + return False + return True diff --git a/profiler/cluster_analyse_review/common_func/path_manager.py b/profiler/cluster_analyse_review/common_func/path_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..7ef7b4c345c024a0980c6ce2d91839b64c351740 --- /dev/null +++ b/profiler/cluster_analyse_review/common_func/path_manager.py @@ -0,0 +1,200 @@ +# Copyright (c) 2023 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import re +import shutil +import platform + + +class PathManager: + MAX_PATH_LENGTH = 4096 + MAX_FILE_NAME_LENGTH = 255 + DATA_FILE_AUTHORITY = 0o640 + DATA_DIR_AUTHORITY = 0o750 + WINDOWS = "windows" + + @classmethod + def check_input_directory_path(cls, path: str): + """ + Function Description: + check whether the path is valid, some businesses can accept a path that does not exist, + so the function do not verify whether the path exists + Parameter: + path: the path to check, whether the incoming path is absolute or relative depends on the business + Exception Description: + when invalid data throw exception + """ + cls.input_path_common_check(path) + base_name = os.path.basename(path) + if os.path.isfile(path): + msg = f"Invalid input path which is a file path: {base_name}" + raise RuntimeError(msg) + + @classmethod + def check_input_file_path(cls, path: str): + """ + Function Description: + check whether the file path is valid, some businesses can accept a path that does not exist, + so the function do not verify whether the path exists + Parameter: + path: the file path to check, whether the incoming path is absolute or relative depends on the business + Exception Description: + when invalid data throw exception + """ + cls.input_path_common_check(path) + base_name = os.path.basename(path) + if os.path.isdir(path): + msg = f"Invalid input path which is a directory path: {base_name}" + raise RuntimeError(msg) + + @classmethod + def check_path_length(cls, path: str): + if len(path) > cls.MAX_PATH_LENGTH: + raise RuntimeError("Length of input path exceeds the limit.") + path_split_list = path.split("/") + for path in path_split_list: + path_list = path.split("\\") + for name in path_list: + if len(name) > cls.MAX_FILE_NAME_LENGTH: + raise RuntimeError("Length of input path exceeds the limit.") + + @classmethod + def input_path_common_check(cls, path: str): + if len(path) > cls.MAX_PATH_LENGTH: + raise RuntimeError("Length of input path exceeds the 
limit.") + + if os.path.islink(path): + msg = f"Invalid input path which is a soft link." + raise RuntimeError(msg) + + if platform.system().lower() == cls.WINDOWS: + pattern = r'(\.|:|\\|/|_|-|\s|[~0-9a-zA-Z\u4e00-\u9fa5])+' + else: + pattern = r'(\.|/|_|-|\s|[~0-9a-zA-Z])+' + if not re.fullmatch(pattern, path): + msg = f"Invalid input path." + raise RuntimeError(msg) + + path_split_list = path.split("/") + for path in path_split_list: + path_list = path.split("\\") + for name in path_list: + if len(name) > cls.MAX_FILE_NAME_LENGTH: + raise RuntimeError("Length of input path exceeds the limit.") + + @classmethod + def check_path_owner_consistent(cls, path: str): + """ + Function Description: + check whether the path belong to process owner + Parameter: + path: the path to check + Exception Description: + when invalid path, prompt the user + """ + base_name = os.path.basename(path) + if not os.path.exists(path): + msg = f"Invalid path: {base_name}" + raise RuntimeError(msg) + if platform.system().lower() == cls.WINDOWS: + return + if os.stat(path).st_uid != os.getuid(): + check_msg = input("The path does not belong to you, do you want to continue? [y/n]") + if check_msg.lower() != "y": + raise RuntimeError("The user choose not to continue.") + + @classmethod + def check_path_writeable(cls, path): + """ + Function Description: + check whether the path is writable + Parameter: + path: the path to check + Exception Description: + when invalid data throw exception + """ + cls.check_path_owner_consistent(path) + if os.path.islink(path): + msg = f"Invalid path which is a soft link." 
    @classmethod
    def check_path_readable(cls, path):
        """
        Function Description:
            check whether the path is readable
        Parameter:
            path: the path to check
        Exception Description:
            when invalid data throw exception
        """
        cls.check_path_owner_consistent(path)
        if os.path.islink(path):
            msg = f"Invalid path which is a soft link."
            raise RuntimeError(msg)
        base_name = os.path.basename(path)
        if not os.access(path, os.R_OK):
            msg = f"The path permission check failed: {base_name}"
            raise RuntimeError(msg)
+ raise RuntimeError(msg) + return os.path.realpath(path) diff --git a/profiler/cluster_analyse_review/common_func/sql_extention_func.py b/profiler/cluster_analyse_review/common_func/sql_extention_func.py new file mode 100644 index 0000000000000000000000000000000000000000..987a0d4365307704d6abf32575a48cc15c0fa33d --- /dev/null +++ b/profiler/cluster_analyse_review/common_func/sql_extention_func.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np + + +class Median: + + def __init__(self) -> None: + self.data = [] + + def step(self, value) -> None: + self.data.append(value) + + def finalize(self): + return np.median(self.data) + + +class LowerQuartile: + + def __init__(self) -> None: + self.data = [] + + def step(self, value) -> None: + self.data.append(value) + + def finalize(self): + return np.quantile(self.data, 0.25) + + +class UpperQuartile: + + def __init__(self) -> None: + self.data = [] + + def step(self, value) -> None: + self.data.append(value) + + def finalize(self): + return np.quantile(self.data, 0.75) + + +class StandardDeviation: + + def __init__(self) -> None: + self.data = [] + + def step(self, value) -> None: + self.data.append(value) + + def finalize(self): + return np.std(self.data) + + +# func_name, params_count, class +SqlExtentionAggregateFunc = [ + ('median', 1, Median), + ('lower_quartile', 1, LowerQuartile), + ('upper_quartile', 1, UpperQuartile), + ('stdev', 1, StandardDeviation) +] diff --git a/profiler/cluster_analyse_review/common_func/table_constant.py b/profiler/cluster_analyse_review/common_func/table_constant.py new file mode 100644 index 0000000000000000000000000000000000000000..de6d47e97e5683493905de5353a9978195e87b70 --- /dev/null +++ b/profiler/cluster_analyse_review/common_func/table_constant.py @@ -0,0 +1,27 @@ +class TableConstant: + + RANK_SET = "rank_set" + STEP = "step" + RANK_ID = "rank_id" + TYPE = "type" + HCCL_OP_NAME = "hccl_op_name" + GROUP_NAME = "group_name" + START_TIMESTAMP = "start_timestamp" + ELAPSED_TIME = "elapse_time" + TRANSIT_TIME = "transit_time" + WAIT_TIME = "wait_time" + SYNCHRONIZATION_TIME = "synchronization_time" + IDLE_TIME = "idle_time" + SYNCHRONIZATION_TIME_RATIO = "synchronization_time_ratio" + WAIT_TIME_RATIO = "wait_time_ratio" + BAND_TYPE = "band_type" + TRANSIT_SIZE = "transit_size" + BANDWIDTH = "bandwidth" + LARGE_PACKET_RATIO = "large_packet_ratio" + PACKAGE_SIZE = "package_size" + COUNT = "count" + 
TOTAL_DURATION = "total_duration" + SRC_RANK = "src_rank" + DST_RANK = "dst_rank" + TRANSPORT_TYPE = "transport_type" + OPNAME = "op_name" diff --git a/profiler/cluster_analyse_review/common_func/tables_config.py b/profiler/cluster_analyse_review/common_func/tables_config.py new file mode 100644 index 0000000000000000000000000000000000000000..f010014519f864e627f83b99ad0df26af98af3f9 --- /dev/null +++ b/profiler/cluster_analyse_review/common_func/tables_config.py @@ -0,0 +1,73 @@ +class TablesConfig: + DATA = { + "ClusterCommAnalyzerTimeMap": [ + ("rank_set", "TEXT, null"), + ("step", "TEXT, null"), + ("rank_id", "INTEGER, null"), + ("hccl_op_name", "TEXT, null"), + ("group_name", "TEXT, null"), + ("start_timestamp", "NUMERIC, null"), + ("elapsed_time", "NUMERIC, null"), + ("transit_time", "NUMERIC, null"), + ("wait_time", "NUMERIC, null"), + ("synchronization_time", "NUMERIC, null"), + ("idle_time", "NUMERIC, null"), + ("synchronization_time_ratio", "NUMERIC, null"), + ("wait_time_ratio", "NUMERIC, null") + ], + "CommunicationGroupMap": [ + ("type", "TEXT, null"), + ("rank_set", "TEXT, null") + ], + "ClusterCommAnalyzerBandwidthMap": [ + ("rank_set", "TEXT, null"), + ("step", "TEXT, null"), + ("rank_id", "INTEGER, null"), + ("hccl_op_name", "TEXT, null"), + ("group_name", "TEXT, null"), + ("band_type", "TEXT, null"), + ("transit_size", "NUMERIC, null"), + ("transit_time", "NUMERIC, null"), + ("bandwidth", "NUMERIC, null"), + ("large_packet_ratio", "NUMERIC, null"), + ("package_size", "NUMERIC, null"), + ("count", "NUMERIC, null"), + ("total_duration", "NUMERIC, null") + ], + "ClusterCommAnalyzerMatrixMap": [ + ("rank_set", "TEXT, null"), + ("step", "TEXT, null"), + ("hccl_op_name", "TEXT, null"), + ("group_name", "TEXT, null"), + ("src_rank", "TEXT, null"), + ("dst_rank", "TEXT, null"), + ("transit_size", "NUMERIC, null"), + ("transit_time", "NUMERIC, null"), + ("bandwidth", "NUMERIC, null"), + ("transport_type", "TEXT, null"), + ("op_name", "TEXT, null") + ], + 
"ClusterStepTraceTimeMap": [ + ("step", "TEXT, null"), + ("type", "TEXT, null"), + ("index", "TEXT, null"), + ("computing", "NUMERIC, null"), + ("communication_not_overlapped", "NUMERIC, null"), + ("overlapped", "NUMERIC, null"), + ("communication", "NUMERIC, null"), + ("free", "NUMERIC, null"), + ("stage", "NUMERIC, null"), + ("bubble", "NUMERIC, null"), + ("communication_not_overlapped_and_exclude_receive", "NUMERIC, null"), + ("preparing", "NUMERIC, null") + ], + "HostInfoMap": [ + ("hostUid", "INTEGER, null"), + ("hostName", "TEXT, null") + ], + "RankDeviceMapMap": [ + ("rankId", "INTEGER, null"), + ("deviceId", "INTEGER, null"), + ("hostUid", "INTEGER, null") + ] + } diff --git a/profiler/cluster_analyse_review/common_func/utils.py b/profiler/cluster_analyse_review/common_func/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0a20a5c237f9f46e7b7425ef4b295dad4656174e --- /dev/null +++ b/profiler/cluster_analyse_review/common_func/utils.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def format_columns(df: pd.DataFrame):
    """Normalize statistics column labels to the *Ns-suffixed naming scheme.

    Renames pandas describe()/quantile() labels (e.g. "25%", "mean", "count")
    to their canonical names and returns the frame with the nine statistics
    columns first, followed by any remaining columns in their original order.
    """
    rename_map = {
        "25%": "Q1Ns", 0.25: "Q1Ns", "Q1": "Q1Ns",
        "50%": "MedianNs", 0.5: "MedianNs", "median": "MedianNs",
        "75%": "Q3Ns", 0.75: "Q3Ns", "Q3": "Q3Ns",
        "min": "MinNs",
        "max": "MaxNs",
        "sum": "SumNs",
        "std": "StdNs",
        "mean": "MeanNs",
        "count": "Count",
    }
    renamed = df.rename(columns=rename_map)

    ordered = ["Count", "MeanNs", "StdNs", "MinNs", "Q1Ns", "MedianNs", "Q3Ns", "MaxNs", "SumNs"]
    trailing = [col for col in renamed.columns if col not in ordered]
    return renamed[ordered + trailing]
0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c --- /dev/null +++ b/profiler/cluster_analyse_review/communication_group/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/communication_group/base_communication_group.py b/profiler/cluster_analyse_review/communication_group/base_communication_group.py new file mode 100644 index 0000000000000000000000000000000000000000..23d7cb2986814e6e8cb45ac4ee9003f227ac881f --- /dev/null +++ b/profiler/cluster_analyse_review/communication_group/base_communication_group.py @@ -0,0 +1,227 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# --- cluster_analyse/communication_group/base_communication_group.py ---

import os
from abc import abstractmethod
from collections import defaultdict
from copy import deepcopy
from multiprocessing import Pool

from common_func.constant import Constant
from cluster_utils.data_transfer_adapter import DataTransferAdapter


class BaseCommunicationGroup:
    """Template for building communication groups from per-rank profiling data.

    Subclasses supply ``read_communication_func`` (parse one rank's data) and
    ``dump_data`` (persist the result); ``generate`` drives the full pipeline.
    """

    def __init__(self, params: dict):
        self.collection_path = params.get(Constant.COLLECTION_PATH)
        self.data_map = params.get(Constant.DATA_MAP)
        self.data_type = params.get(Constant.DATA_TYPE)
        self.analysis_mode = params.get(Constant.ANALYSIS_MODE)
        # Filled by load_communication_data with a list of
        # (rank_id, comm dict, matrix dict) tuples; the name is historical.
        self.rank_comm_dir_dict = {}
        self.p2p_link = []
        self.collective_group_dict = defaultdict(set)
        self.p2p_comm_group = []
        self.communication_group = {}
        self.communication_ops = []
        self.matrix_ops = []
        self.adapter = DataTransferAdapter()

    def load_communication_data(self):
        """Locate each rank's communication/matrix files and parse them in a
        process pool via the subclass's ``read_communication_func``."""
        comm_op_dirs = []
        for rank_id, profiling_dir_path in self.data_map.items():
            if self.data_type == Constant.TEXT:
                comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_JSON)
                matrix_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_MATRIX_JSON)
            else:
                # DB format keeps time and matrix data in a single analyzer DB.
                comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.DB_COMMUNICATION_ANALYZER)
                matrix_dir = comm_dir
            if os.path.exists(comm_dir) or os.path.exists(matrix_dir):
                comm_op_dirs.append((rank_id, comm_dir, matrix_dir))
            else:
                print(
                    f"[WARNING] Rank {rank_id} does not have valid communication data and communication_matrix data.")
        with Pool() as p:
            self.rank_comm_dir_dict = p.map(self.read_communication_func, comm_op_dirs)

    def set_p2p_groups(self):
        """Merge overlapping p2p rank sets into connected components."""
        self.p2p_link = sorted(self.p2p_link, key=lambda x: min(x))
        while self.p2p_link:
            union_set = deepcopy(self.p2p_link[0])
            rm_list = [self.p2p_link[0]]
            for idx, link_rank_set_x in enumerate(self.p2p_link[1:]):
                if UnionFind.is_connected(link_rank_set_x, union_set):
                    union_set = union_set.union(link_rank_set_x)
                    rm_list.append(link_rank_set_x)
            self.p2p_comm_group.append(union_set)
            self.p2p_link = [element for element in self.p2p_link if element not in rm_list]

    def generate_collective_communication_group(self):
        """Expose each collective group's rank set as a plain list."""
        self.communication_group[Constant.COLLECTIVE] = \
            [list(group) for group in self.collective_group_dict.values()]

    def generate_p2p_communication_group(self):
        """Infer pipeline stages by unioning connected collective rank sets,
        keyed (and finally ordered) by each stage's smallest rank."""
        stage_group = {}
        for group_name, rank_set in self.collective_group_dict.items():
            if not self.whether_valid_comm_group(rank_set):
                continue
            unioned_set = set()
            remove_key = []
            for first_rank, stage in stage_group.items():
                if UnionFind.is_connected(rank_set, stage):
                    unioned_set = UnionFind.union(rank_set, stage, unioned_set)
                    remove_key.append(first_rank)
            if unioned_set:
                for key in remove_key:
                    del stage_group[key]
                stage_group[min(unioned_set)] = unioned_set
            else:
                stage_group[min(rank_set)] = rank_set
        first_rank_sort_list = sorted([first_rank for first_rank in stage_group])
        self.communication_group[Constant.P2P] = \
            [list(stage_group.get(first_rank, {})) for first_rank in first_rank_sort_list]

    def whether_valid_comm_group(self, rank_set: set):
        """
        while distinguish which communication group should be used to infer stage info, these group should be ignored:
        1. group can not include more than 1 rank in every single p2p group
        """
        for p2p_rank_set in self.p2p_comm_group:
            if len(rank_set.intersection(p2p_rank_set)) > 1:
                return False
        return True

    @abstractmethod
    def read_communication_func(self, params: tuple):
        """Parse one rank's data; must return (rank_id, comm dict, matrix dict)."""
        pass

    def analyze_communication_data(self):
        """Walk the parsed per-rank tuples, recording collective membership,
        communication ops and matrix/p2p links."""
        for rank_id, rank_id_comm_dict, rank_id_matrix_dict in self.rank_comm_dir_dict:
            for step_id, step_id_dict in rank_id_comm_dict.items():
                if not isinstance(step_id_dict, dict):
                    print(f"[WARNING] rank{rank_id}'s communication.json has a wrong data struct.")
                    continue
                self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE))
                for comm_op_type, comm_op_dict in step_id_dict.items():
                    self.add_communication_ops(rank_id, step_id, comm_op_type, comm_op_dict)

            for step_id, step_id_dict in rank_id_matrix_dict.items():
                if not isinstance(step_id_dict, dict):
                    print(f"[WARNING] rank{rank_id}'s communication_matrix.json has a wrong data struct.")
                    continue
                self.set_p2p_link(rank_id, step_id, rank_id_matrix_dict)
                self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE))

    @abstractmethod
    def dump_data(self):
        """Persist the computed communication groups (format-specific)."""
        pass

    def collect_comm_data(self):
        """Bundle all computed structures into the result dict consumed upstream."""
        comm_data_dict = {
            Constant.COLLECTIVE_GROUP: self.collective_group_dict,
            Constant.COMMUNICATION_OPS: self.communication_ops,
            Constant.MATRIX_OPS: self.matrix_ops,
            Constant.COMMUNICATION_GROUP: self.communication_group
        }
        return comm_data_dict

    def generate(self):
        """Run the full pipeline: load -> analyze -> group -> dump -> collect."""
        self.load_communication_data()
        self.analyze_communication_data()
        self.set_p2p_groups()
        self.generate_collective_communication_group()
        self.generate_p2p_communication_group()
        self.dump_data()
        return self.collect_comm_data()

    def set_p2p_link(self, rank_id: int, step_id: str, rank_id_matrix_dict: dict):
        """Record matrix ops for one step and collect its p2p links."""
        ops = rank_id_matrix_dict.get(step_id, {})
        self.add_matrix_ops(rank_id, step_id, ops)
        if not ops:
            print(f"[WARNING] rank{rank_id} {step_id} do not have communication matrix ops data.")
            return
        p2p_ops = ops.get(Constant.P2P, {})
        for op_name, link_dict in p2p_ops.items():
            self.append_p2p_link(op_name, link_dict)

    def append_p2p_link(self, op_name, link_dict):
        """Add every 'src-dst' link of one op as a rank set (deduplicated).

        NOTE(review): an invalid link key aborts the remaining links of this op
        (``break``, not ``continue``) -- confirm that is intended.
        """
        for link in link_dict:
            if '-' not in link:
                print(f"[WARNING] {op_name} has an invalid link key {link}!")
                break
            src_rank = int(link.split('-')[0])
            dst_rank = int(link.split('-')[1])
            if src_rank != dst_rank:
                rank_set = {src_rank, dst_rank}
                if rank_set in self.p2p_link:
                    continue
                self.p2p_link.append(rank_set)

    def get_collective_ops_name(self, rank_id: int, comm_op_dict: dict):
        """Register this rank in every collective group named after '@' in the op name."""
        for comm_op in comm_op_dict:
            if comm_op.startswith('Total'):
                continue
            group_name = comm_op.split('@')[-1]
            self.collective_group_dict[group_name].add(rank_id)

    def add_communication_ops(self, rank_id: str, step_id: str, comm_op_type: str, comm_op_dict: dict):
        """Append one record per non-'Total' communication op."""
        for comm_op in comm_op_dict:
            if comm_op.startswith('Total'):
                continue
            group_name = comm_op.split('@')[-1]
            self.communication_ops.append({
                Constant.RANK_ID: rank_id,
                Constant.STEP_ID: step_id,
                Constant.COMM_OP_TYPE: comm_op_type,
                Constant.COMM_OP_NAME: comm_op,
                Constant.GROUP_NAME: group_name,
                Constant.COMM_OP_INFO: comm_op_dict.get(comm_op)
            })

    def add_matrix_ops(self, rank_id: int, step_id: str, step_id_dict: dict):
        """Append one record per non-'Total' matrix op (collective or p2p only)."""
        for comm_op_type, comm_dict in step_id_dict.items():
            if comm_op_type != Constant.COLLECTIVE and comm_op_type != Constant.P2P:
                print(f"[WARNING] Unknown communication operators type!")
                continue
            for op_name, op_link_info in comm_dict.items():
                if op_name.startswith('Total'):
                    continue
                group_name = op_name.split('@')[-1]
                self.matrix_ops.append({
                    Constant.RANK_ID: rank_id,
                    Constant.STEP_ID: step_id,
                    Constant.COMM_OP_TYPE: comm_op_type,
                    Constant.COMM_OP_NAME: op_name,
                    Constant.GROUP_NAME: group_name,
                    Constant.COMM_OP_INFO: op_link_info
                })


class UnionFind(object):
    """Disjoint Set Union"""

    @classmethod
    def union(cls, first_set: set, second_set: set, third_set: set):
        """make p and q the same set"""
        return first_set | second_set | third_set

    @classmethod
    def is_connected(cls, first_set: set, second_set: set):
        """
        check whether set p and set q are connected
        """
        if first_set & second_set:
            return True
        else:
            return False


# --- cluster_analyse/communication_group/communication_db_group.py ---

from common_func.db_manager import DBManager


class CommunicationDBGroup(BaseCommunicationGroup):
    """Communication-group builder for the sqlite DB profiling format."""

    COMMUNICATION_GROUP_TABLE = "CommunicationGroup"

    def __init__(self, params: dict):
        super().__init__(params)

    def read_communication_func(self, params: tuple):
        """Load one rank's analyzer DB and convert it to the JSON-shaped dicts.

        Returns (rank_id, comm dict, matrix dict); on a malformed params tuple
        it returns (-1, {}, {}).
        """
        if len(params) < 3:
            # BUGFIX: was ``return -1, ({}, {}, {})`` -- a 2-tuple, which broke
            # the 3-way unpacking in analyze_communication_data and disagreed
            # with CommunicationJsonGroup's error return.
            return -1, {}, {}
        rank_id = params[0]
        db_path = params[1]
        time_data = []
        bandwidth_data = []
        matrix_data = []
        if os.path.exists(db_path):
            conn, cursor = DBManager.create_connect_db(db_path)
            time_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_TIME)
            bandwidth_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_BANDWIDTH)
            matrix_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_MATRIX)
            if (DBManager.check_tables_in_db(db_path, Constant.TABLE_COMM_ANALYZER_TIME,
                                             Constant.TABLE_COMM_ANALYZER_BANDWIDTH)
                    and self.analysis_mode in ["all", "communication_time"]):
                time_data = DBManager.fetch_all_data(cursor, time_info_sql)
                bandwidth_data = DBManager.fetch_all_data(cursor, bandwidth_info_sql)
            if (DBManager.check_tables_in_db(db_path, Constant.TABLE_COMM_ANALYZER_MATRIX)
                    and self.analysis_mode in ["all", "communication_matrix"]):
                matrix_data = DBManager.fetch_all_data(cursor, matrix_info_sql)
            DBManager.destroy_db_connect(conn, cursor)
        comm_data = self.adapter.transfer_comm_from_db_to_json(time_data, bandwidth_data)
        comm_matrix_data = self.adapter.transfer_matrix_from_db_to_json(matrix_data)
        return rank_id, comm_data, comm_matrix_data

    def dump_data(self):
        """Write (type, rank_set) rows into the cluster analyzer DB."""
        output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT)
        result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER)
        res = []
        for data_type, data_list in self.communication_group.items():
            for data in data_list:
                rank_set = "(" + ",".join(str(i) for i in data) + ")"
                res.append([data_type, rank_set])
        if res:
            DBManager.create_tables(result_db, self.COMMUNICATION_GROUP_TABLE)
            conn, cursor = DBManager.create_connect_db(result_db)
            sql = "insert into {} values ({value})".format(self.COMMUNICATION_GROUP_TABLE,
                                                           value="?," * (len(res[0]) - 1) + "?")
            DBManager.executemany_sql(conn, sql, res)
            DBManager.destroy_db_connect(conn, cursor)
        else:
            print("[WARNING] The CommunicationGroup table won't be created because no data has been calculated.")
# --- cluster_analyse/communication_group/communication_group_generator.py ---

from common_func.constant import Constant
from communication_group.communication_db_group import CommunicationDBGroup
from communication_group.communication_json_group import CommunicationJsonGroup


class CommunicationGroupGenerator:
    """Facade that picks the communication-group builder matching the
    profiling data format (sqlite DB vs. text/JSON)."""

    # Data format constant -> concrete builder class.
    GROUP_MAP = {
        Constant.DB: CommunicationDBGroup,
        Constant.TEXT: CommunicationJsonGroup
    }

    def __init__(self, params: dict):
        # NOTE(review): an unknown DATA_TYPE makes ``get`` return None and the
        # call below raise TypeError -- presumably validated upstream; confirm.
        processor_cls = self.GROUP_MAP.get(params.get(Constant.DATA_TYPE))
        self.processor = processor_cls(params)

    def generate(self):
        """Delegate to the chosen builder and return its comm data dict."""
        return self.processor.generate()
# --- cluster_analyse/communication_group/communication_json_group.py ---

import os

from common_func.constant import Constant
from common_func.file_manager import FileManager
from communication_group.base_communication_group import BaseCommunicationGroup


class CommunicationJsonGroup(BaseCommunicationGroup):
    """Communication-group builder for the text (JSON) profiling format."""

    COMMUNICATION_GROUP_JSON = "communication_group.json"

    def __init__(self, params: dict):
        super().__init__(params)

    def dump_data(self):
        """Write the computed communication groups to communication_group.json."""
        FileManager.create_json_file(self.collection_path, self.communication_group, self.COMMUNICATION_GROUP_JSON)

    def read_communication_func(self: any, params: tuple):
        """Load one rank's communication/matrix JSON files.

        ``params`` is (rank_id, communication.json path, matrix json path);
        returns (rank_id, comm dict, matrix dict), or (-1, {}, {}) when the
        tuple is malformed. Files are only read when the analysis mode asks
        for that kind of data.
        """
        if len(params) < 3:
            return -1, {}, {}
        rank_id, comm_json_path, matrix_json_path = params[0], params[1], params[2]
        comm_data = {}
        matrix_data = {}
        want_time = self.analysis_mode in ["all", "communication_time"]
        want_matrix = self.analysis_mode in ["all", "communication_matrix"]
        if want_time and os.path.exists(comm_json_path):
            comm_data = FileManager.read_json_file(comm_json_path)
        if want_matrix and os.path.exists(matrix_json_path):
            matrix_data = FileManager.read_json_file(matrix_json_path)
        return rank_id, comm_data, matrix_data
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse_review/prof_bean/step_trace_time_bean.py b/profiler/cluster_analyse_review/prof_bean/step_trace_time_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..b0a3be4f5eaccea70aa912bc85e68d70dbda3bde --- /dev/null +++ b/profiler/cluster_analyse_review/prof_bean/step_trace_time_bean.py @@ -0,0 +1,39 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class StepTraceTimeBean:
    """One parsed row of a step_trace_time file.

    ``data`` maps column name -> raw cell value; the "Step" column is treated
    specially and every other column is interpreted as a numeric duration.
    """

    STEP = "Step"
    # Columns prepended when re-emitting headers for the cluster summary.
    COMPLEMENT_HEADER = ["Step", "Type", "Index"]

    def __init__(self, data: dict):
        # BUGFIX: the parameter was annotated ``list`` but is used strictly as
        # a mapping (``keys``/``get``/``items``).
        self._data = data

    @property
    def row(self) -> list:
        """All values except the step id, converted to float (in column order).

        Raises ValueError/TypeError if a cell is not numeric.
        """
        # Was ``float(self._data.get(field_name, ))`` -- a redundant re-lookup
        # with a stray trailing comma; iterate items directly instead.
        return [float(value) for field_name, value in self._data.items()
                if field_name != self.STEP]

    @property
    def step(self) -> str:
        """The raw step id, or '' when the Step column is absent."""
        return self._data.get(self.STEP, '')

    @property
    def all_headers(self) -> list:
        """Complement headers followed by the data columns.

        Assumes the first key of ``data`` is the Step column, which is dropped
        in favour of COMPLEMENT_HEADER.
        """
        return self.COMPLEMENT_HEADER + list(self._data.keys())[1:]
on_trace_ready=torch.profiler.tensorboard_trace_handler("./result_dir") +) as prof: + for step in range(step_number): + train_one_step() + prof.step() +``` + +采集样例代码参考二: + +```Python +prof = torch.profiler.profile( + profile_memory=True, # 内存数据采集的开关 + record_shapes=True, # 算子input shape信息采集的开关 + on_trace_ready=torch.profiler.tensorboard_trace_handler("./result_dir")) +for step in range(step_number): + if step == 11: + prof.start() + train_one_step() + if step == 11: + prof.stop() +``` + +PyTorch Profiler采集结果数据目录结构如下: + +```Python +|- pytorch_profiling + |- *.pt.trace.json +``` + +#### NPU性能数据采集 + +通过Ascend PyTorch Profiler工具采集NPU的性能数据,采集参数配置与GPU基本一致,只需将GPU的性能数据采集代码中torch.profiler替换成torch_npu.profiler,参考链接:[Profiling数据采集](https://gitee.com/ascend/att/tree/master/profiler)。 + +Ascend PyTorch Profiler采集结果数据目录结构如下: + +```bash +|- ascend_pytorch_profiling + |- * _ascend_pt + |- ASCEND_PROFILER_OUTPUT + |- trace_view.json + |- FRAMEWORK + |- PROF_XXX + |- * _ascend_pt +``` + +### MindSpore框架性能数据采集 + +#### NPU性能数据采集 + +当前MindSpore场景仅支持NPU环境性能数据与PyTorch GPU性能数据进行比对;以及MindSpore训练工程在NPU上,不同版本之间的性能数据进行比对。 + +通过MindSpore性能调试工具采集NPU的性能数据,建议只采集或只解析一个step的性能数据,参考链接:[性能调试(Ascend)](https://www.mindspore.cn/mindinsight/docs/zh-CN/r2.3/performance_profiling_ascend.html)。 + +MindSpore性能调试工具采集结果数据目录结构如下: + +``` +|- profiler/{rank-*}_{timestamps}_ascend_ms + |- ASCEND_PROFILER_OUTPUT + |- kernel_details.csv + |- trace_view.json +``` + +进行性能比对时,MindSpore采集的性能数据须指定到`profiler/{rank-*}_{timestamps}_ascend_ms`或`ASCEND_PROFILER_OUTPUT`层级。 + +### 性能数据比对 + +性能比对工具将总体性能拆解为训练耗时和内存占用,其中训练耗时可拆分为算子(包括算子和nn.Module)、通信、调度三个维度,以打屏的形式输出总体指标,帮助用户定界劣化的方向。与此同时,工具还会生成performance_comparison_result_*.xlsx,展示每个算子在执行耗时、通信耗时、内存占用的优劣,可通过DIFF列大于0筛选出劣化算子。详细介绍请参见“**比对结果说明**”。 + +性能比对工具支持使用**命令行**和**脚本**两种方式执行性能数据比对操作,这两种方式均支持**通用参数**和**算子性能比对特有参数**。 + +#### 命令行方式 + +1. 参见《[性能工具](../README.md)》完成工具安装。 + +2. 
执行如下命令进行性能数据比对: + + ``` + msprof-analyze compare -d [比对性能数据文件所在路径] -bp [基准性能数据文件所在路径] --output_path=[比对结果文件存放路径] + ``` + + - -d(必选):比对性能数据文件所在路径。可以指定以“ascend_pt”结尾的目录、ASCEND_PROFILER_OUTPUT目录或trace_view.json文件,指定trace_view.json无法显示算子的内存占用。 + - -bp(必选):基准性能数据文件所在路径。基准性能数据文件若以GPU为基准,指定到以".pt.trace"结尾的json文件;若以NPU不同版本为基准,指定文件与-d一致。 + - --output_path(可选):性能比对结果存放的路径,默认保存在当前目录。 + +#### 脚本方式 + +将att代码仓下载到本地,执行如下命令: + +```bash +# 进入att代码仓目录下的compare_tools目录 +cd att/profiler/compare_tools +# 执行最简比对命令 +python performance_compare.py [基准性能数据文件所在路径] [比对性能数据文件所在路径] --output_path=[比对结果文件存放路径] +``` + +- 基准性能数据文件所在路径(必选):若以GPU为基准,指定到以".pt.trace"结尾的json文件;若以NPU不同版本为基准,指定文件参考**比对性能数据文件所在路径**。 +- 比对性能数据文件所在路径(必选):可以指定以“ascend_pt”结尾的目录、ASCEND_PROFILER_OUTPUT目录或trace_view.json文件,指定trace_view.json无法显示算子的内存占用。 +- --output_path(可选):性能比对结果存放的路径,默认保存在当前目录。 + +#### 通用参数说明 + +| 参数名 | 说明 | 是否必选 | +| ------------------------------ |---------------------------------------------------| -------- | +| --enable_profiling_compare | 开启总体性能比对。 | 否 | +| --enable_operator_compare | 开启算子性能比对。MindSpore场景暂不支持。该开关较耗时,建议只采集一个step的性能数据。 | 否 | +| --enable_communication_compare | 开启通信性能比对。 | 否 | +| --enable_memory_compare | 开启算子内存比对。MindSpore场景暂不支持。该开关较耗时,建议只采集一个step的性能数据。 | 否 | + +说明:以上4个开关均不设置的情况下,**工具默认开启所有的性能比对**,当用户设置了以上开关,则按照用户设置的开关进行性能比对,示例如下: + +```bash +msprof-analyze compare -d [比对性能数据文件所在路径] -bp [基准性能数据文件所在路径] --output_path=./result_dir --enable_profiling_compare +``` + +或 + +```bash +python performance_compare.py [基准性能数据文件] [比对性能数据文件] --output_path=./result_dir --enable_profiling_compare +``` + +此时表示仅开启总体性能比对。 + +#### 算子性能比对特有参数说明 + +| 参数名 | 说明 | 是否必选 | +| ----------------- | ------------------------------------------------------------ | -------- | +| --gpu_flow_cat | 配置GPU trace中CPU侧算子与device kernel的连线标识,当GPU的Device Duration(us)均为0时设置。使用chrome://tracing打开GPU的json,右上角Flow events找到连线标识,将标识配置进该参数。使用示例:--gpu_flow_cat=async_gpu | 否 | +| --use_input_shape | 开启算子精准匹配,默认关闭。使用示例:--use_input_shape | 否 | +| 
--max_kernel_num | 设置CPU侧算子下发的最大kernel数量,当超过设定值时工具会自动往下找子算子,直至满足条件。默认仅比对最上层算子,粒度较粗;若想要更细粒度的算子比对,可设置该参数,参数值不得小于4,参数值设置越小,比对粒度越细。使用示例:--max_kernel_num=10 | 否 | +| --op_name_map | 设置GPU与NPU等价的算子名称的映射关系,以字典形式存入。使用示例:--op_name_map={'Optimizer.step#SGD.step':'Optimizer.step#NpuFusedSGD.step'} | 否 | + +## 比对结果说明 + +MindSpore场景仅支持**总体性能**和**通信性能**的对比。 + +### 总体性能 + +总体性能比对结果以打屏的形式呈现。 + +| 字段 | 说明 | +| --------------------------------------- | ------------------------------------------------------------ | +| Cube Time(Num) | Cube算子总耗时,Num表示计算的次数。 | +| Vector Time(Num) | Vector算子总耗时,Num表示计算的次数。 | +| Conv Time(Forward)(Num) | conv前向算子耗时,Num表示计算的次数。 | +| Conv Time(Backward)(Num) | conv反向算子耗时,Num表示计算的次数。 | +| Flash Attention Time(Forward)(Num) | Flash Attention算子前向耗时,Num表示计算的次数。 | +| Flash Attention Time(Backward)(Num) | Flash Attention算子反向耗时,Num表示计算的次数。 | +| Paged Attention Time(Num) | Paged Attention算子耗时,Num表示计算的次数。 | +| Lccl Time(Num) | Lccl算子耗时,Num表示计算的次数。 | +| Computing Time | 计算流耗时,计算流所有event耗时总和。如果有多条并发计算,计算流耗时对重叠部分只会计算一次。 | +| Mem Usage | 内存使用。GPU上的内存使用可以使用nvidia-smi查看,NPU上的内存使用可以使用npu-smi查看,Profiling信息采集时打开profile_memory=True开关,mem usage显示的是memory_record里面的最大resevered值,一般来说是进程级内存。 | +| Uncovered Communication Time(Wait Time) | 通信未掩盖耗时,包含Wait Time(只有采集性能数据的Level等级为L1以上并且采集NPU数据时才会存在)为同步时间。 | +| SDMA Time(Num) | 拷贝类任务耗时,Num表示计算的次数。 | +| Free Time | 调度耗时 = E2E耗时 - 算子耗时 - 通信不可掩盖耗时。Free的定义为Device侧既不在通信又不在计算的时间,因此包含拷贝时间(SDMA Time)。 | +| E2E Time(Not minimal profiling) | E2E总耗时,计算流端到端耗时。当存在Not minimal profiling时,表示该时间存在性能膨胀,会影响通信和调度耗时。 | +| Other Time | AI CPU、DSA、TensorMove等其他算子耗时。 | + +可以采取最简性能数据采集的方式来减少E2E耗时的性能膨胀,示例代码如下: + +```python +with torch_npu.profiler.profile( + activities=[torch_npu.profiler.ProfilerActivity.NPU], + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=1, repeat=1, skip_first=10), + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./result"), +) as prof: + for step in range(steps): + train_one_step() + prof.step() +``` + 
+activities配置仅采集NPU数据,不配置experimental_config参数以及其他可选开关。 + +- 当Computing Time耗时增大,分析**算子性能**。 +- 当Uncovered Communication Time耗时增大,分析**通信性能**,若通信性能分析没有劣化的通信算子,代表通信与计算的并行度较差,继续进行NPU的集群性能分析。 +- 当Mem Usage增大,分析**算子内存**,若没有明显占用较大的算子,则代表算子内存申请量没有差异,问题在于内存的释放(持有时间过久),可以使用tensorboard或ascend insight继续进行NPU内存的分析。 + +### 算子性能 + +MindSpore场景暂不支持。 + +#### 比对数据无Python Function + +算子性能比对结果在performance_comparison_result_*.xlsx中OperatorCompare和OperatorCompareStatistic的sheet页呈现。 + +- OperatorCompareStatistic:算子为粒度的统计呈现,按照算子在device上的总耗时与基准算子的差距值(Diff Duration(ms)列)进行逆序。 +- OperatorCompare:算子比对的明细展示,可以查看每一个算子对应的kernel详情。 +- Diff Ratio:比较算子在device上执行总耗时 / 基准算子在device上执行总耗时,红色代表劣化。 +- Device Duration(us):该算子下发到device上执行的所有kernel耗时的总和。 + +步骤1:查看OperatorCompareStatistic页,找出耗时差距TOP的算子。 +步骤2:查看OperatorCompare页,搜索耗时差距TOP的算子,查看具体执行的kernel耗时,寻找可优化点。 + +#### 比对数据有Python Function + +算子性能比对结果在performance_comparison_result_*.xlsx中ModuleCompareStatistic、ModuleCompare的sheet页呈现。 + +当用户采集时开启with_stack开关,会上报python function事件,当比对的双方数据都存在python function的事件时,可进行模块级别的比对。 + +- Module Class:Module名,如nn.Module: Linear。 +- Module Level:Module的层级。 +- Module Name:Module唯一标识名,如/ DynamicNet_0/ Linear_0。 +- Operator Name:框架侧算子名,如aten::add。字段为[ TOTAL ]代表该module的总体情况。 +- Kernel Detail:算子详细信息。 +- Device Self Time(ms):该模块调用的算子(排除子模块)在device侧执行的总耗时,单位ms。 +- Number:该Module或算子被调用的次数。 +- Device Total Time(ms):该模块调用的算子(包含子模块)在device侧执行的总耗时,单位ms。 +- Device Total Time Diff(ms):GPU与NPU的Device Total Time(ms)差值。 +- Device Self Time Diff(ms):GPU与NPU的Device Self Time(ms)差值。 +- Total Time Ratio:GPU与NPU的Device Total Time(ms)比值。 +- Base Call Stack:基准文件模块的调用栈。 +- Comparison Call Stack:比较文件模块的调用栈。 + +ModuleCompare:模块及模块下算子比对的明细展示,可以查看每一个算子对应的kernel详情。 + +- Module Class:Module名,如nn.Module: Linear。 +- Module Level:Module的层级。 +- Module Name:Module唯一标识名,如/ DynamicNet_0/ Linear_0。 +- Operator Name:框架侧算子名,如aten::add。字段为[ TOTAL ]代表该module的总体情况。 +- Kernel Detail:算子详细信息。 +- Device Self Time(us):该模块调用的算子(排除子模块)在device侧执行的总耗时,单位us。 +- Device 
Total Time(us):该模块调用的算子(包含子模块)在device侧执行的总耗时,单位us。 +- Device Total Time Diff(us):GPU与NPU的Device Total Time(us)差值。 +- Device Self Time Diff(us):GPU与NPU的Device Self Time(us)差值。 +- Total Time Ratio:GPU与NPU的Device Total Time(us)比值。 +- Base Call Stack:有劣化的模块或算子,基准文件模块的调用栈。 +- Comparison Call Stack:有劣化的模块或算子,比较文件模块的调用栈。 + +步骤1:查看ModuleCompareStatistic页,找出耗时差距TOP的模块。 + +​ 筛选Operator Name字段为[ TOTAL ],将模块总体情况按照Device Self Time(ms)字段逆序,可识别出耗时差距TOP的模块。 + +​ 恢复数据,可按照Order Id字段升序。 + +步骤2:查看ModuleCompare页,查找耗时差距TOP模块下的劣化算子。 + +步骤3:通过调用栈找到对应的代码行。 + +### 通信性能 + +通信性能比对结果在performance_comparison_result_*.xlsx中CommunicationCompare的sheet页呈现。 + +- 第二行表头:通信算子的summary信息,包括通信算子名称、调用总次数、通信算子总耗时(单位:us)、通信算子平均耗时(单位:us)、通信算子最大耗时(单位:us)、通信算子最小耗时(单位:us)。 +- 无背景色的记录行:通信算子的detail信息,仅支持NPU,包含了该通信算子下的所有Task信息,包括Task名称、Task调用次数、Task总耗时(单位:us)、Task平均耗时(单位:us)、Task最大耗时(单位:us)、Task最小耗时(单位:us)。 +- Diff Ratio: 比较通信算子的总耗时 / 基准通信算子的总耗时,红色代表劣化。 + +### 算子内存 + +MindSpore场景暂不支持。 + +算子内存比对结果在performance_comparison_result_*.xlsx中MemoryCompare和MemoryCompareStatistic的sheet页呈现。 + +- MemoryCompareStatistic:算子为粒度的统计呈现,按照算子占用的总内存与基准算子的差距值(Diff Memory(MB))进行逆序。 + +- MemoryCompare:算子内存比对的明细展示,可以查看每一个算子申请内存的详情。 + +- Diff Ratio: 比较算子占用的总内存 / 基准算子占用的总内存,红色代表劣化。 + +- Size(KB):该算子占用的device内存大小,单位KB。 + +步骤1:查看MemoryCompareStatistic页,找出内存占用差距TOP的算子。 +步骤2:查看MemoryCompare页,搜索内存占用差距TOP的算子,查看具体占用的子算子。 diff --git a/profiler/compare_tools_review/__init__.py b/profiler/compare_tools_review/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8400fd5ecd1246eaee795cebfccfacc80a94f08c --- /dev/null +++ b/profiler/compare_tools_review/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
from abc import ABC, abstractmethod


class BaseComparator(ABC):
    """Base class for all sheet comparators.

    A ``bean`` class supplies the sheet metadata (TABLE_NAME, HEADERS,
    OVERHEAD); subclasses fill ``self._rows`` in ``_compare``.
    """

    def __init__(self, origin_data: any, bean: any):
        self._sheet_name = bean.TABLE_NAME
        self._headers = bean.HEADERS
        self._overhead = bean.OVERHEAD
        self._origin_data = origin_data
        self._bean = bean
        self._rows = []

    def generate_data(self) -> dict:
        """Run the comparison and return one sheet's table keyed by sheet name."""
        self._compare()
        table = {
            "headers": self._headers,
            "rows": self._rows,
            "overhead": self._overhead,
        }
        return {self._sheet_name: table}

    @abstractmethod
    def _compare(self):
        raise NotImplementedError("Function _compare need to be implemented.")
# --- compare_backend/comparator/communication_comparator.py ---

from compare_backend.comparator.base_comparator import BaseComparator
from compare_backend.compare_bean.communication_bean import CommunicationBean
from compare_backend.utils.constant import Constant
from compare_backend.utils.common_func import update_order_id


class CommunicationComparator(BaseComparator):
    """Build comparison rows for communication operators, matched by name."""

    def __init__(self, origin_data: dict, bean: any):
        super().__init__(origin_data, bean)

    def _compare(self):
        base_side = self._origin_data.get(Constant.BASE_DATA, {})
        cmp_side = self._origin_data.get(Constant.COMPARISON_DATA, {})
        # Matched pairs first; ``pop`` removes matches so the second pass
        # only sees comparison-only operators.
        for name, base_op in base_side.items():
            self._rows.extend(CommunicationBean(name, base_op, cmp_side.pop(name, {})).rows)
        for name, cmp_op in cmp_side.items():
            self._rows.extend(CommunicationBean(name, {}, cmp_op).rows)
        update_order_id(self._rows)


# --- compare_backend/comparator/module_comparetor.py ---

class ModuleComparator(BaseComparator):
    """Emit detailed module rows, interleaving comparison-only modules by
    start time so the output keeps chronological order."""

    def __init__(self, origin_data: any, bean: any):
        super().__init__(origin_data, bean)

    def _compare(self):
        if not self._origin_data:
            return
        # Element layout: index 0 = base module, index 1 = comparison module.
        paired = sorted((item for item in self._origin_data if item[0]),
                        key=lambda item: item[0].start_time)
        unpaired = sorted((item for item in self._origin_data if not item[0]),
                          key=lambda item: item[1].start_time)
        cursor = 0
        for base_module, cmp_module in paired:
            if not cmp_module:
                self._rows.extend(self._bean(base_module, cmp_module).rows)
                continue
            # Flush comparison-only modules that started before this pair.
            while cursor < len(unpaired):
                lone = unpaired[cursor][1]
                if lone.start_time >= cmp_module.start_time:
                    break
                self._rows.extend(self._bean(None, lone).rows)
                cursor += 1
            self._rows.extend(self._bean(base_module, cmp_module).rows)
        while cursor < len(unpaired):
            self._rows.extend(self._bean(None, unpaired[cursor][1]).rows)
            cursor += 1
        update_order_id(self._rows)
        # Last column holds the call stack; warn when no row has one.
        if all(row[-1] == Constant.NA for row in self._rows):
            print("[WARNING] If you want to see the operator's call stack, you must enable with_stack switch.")


# --- compare_backend/comparator/module_statistic_comparator.py ---

from collections import OrderedDict


class ModuleStatisticComparator(BaseComparator):
    """Aggregate module rows by module name before emitting statistics."""

    def __init__(self, origin_data: list, bean: any):
        super().__init__(origin_data, bean)

    def _compare(self):
        if not self._origin_data:
            return
        base_groups, cmp_groups = self._group_by_module_name()
        for module_name, base_rows in base_groups.items():
            cmp_rows = cmp_groups.pop(module_name, [])
            self._rows.extend(self._bean(module_name, base_rows, cmp_rows).rows)
        for module_name, cmp_rows in cmp_groups.items():
            self._rows.extend(self._bean(module_name, [], cmp_rows).rows)
        update_order_id(self._rows)

    def _group_by_module_name(self):
        """Bucket base/comparison modules by name, preserving start-time order.

        Element layout: index 0 = base module, index 1 = comparison module.
        """
        base_groups, cmp_groups = OrderedDict(), OrderedDict()
        paired = sorted((item for item in self._origin_data if item[0]),
                        key=lambda item: item[0].start_time)
        unpaired = sorted((item for item in self._origin_data if not item[0]),
                          key=lambda item: item[1].start_time)
        cursor = 0
        for base_module, cmp_module in paired:
            base_groups.setdefault(base_module.module_name, []).append(base_module)
            if not cmp_module:
                continue
            # Interleave comparison-only modules that started earlier.
            while cursor < len(unpaired):
                lone = unpaired[cursor][1]
                if lone.start_time >= cmp_module.start_time:
                    break
                cmp_groups.setdefault(lone.module_name, []).append(lone)
                cursor += 1
            cmp_groups.setdefault(cmp_module.module_name, []).append(cmp_module)
        while cursor < len(unpaired):
            lone = unpaired[cursor][1]
            cmp_groups.setdefault(lone.module_name, []).append(lone)
            cursor += 1
        return base_groups, cmp_groups
b/profiler/compare_tools_review/compare_backend/comparator/operator_comparator.py @@ -0,0 +1,13 @@ +from compare_backend.comparator.base_comparator import BaseComparator + + +class OperatorComparator(BaseComparator): + def __init__(self, origin_data: any, bean: any): + super().__init__(origin_data, bean) + + def _compare(self): + if not self._origin_data: + return + self._rows = [None] * (len(self._origin_data)) + for index, (base_op, comparison_op) in enumerate(self._origin_data): + self._rows[index] = self._bean(index, base_op, comparison_op).row diff --git a/profiler/compare_tools_review/compare_backend/comparator/operator_statistic_comparator.py b/profiler/compare_tools_review/compare_backend/comparator/operator_statistic_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..73aecf6f1283242311bcb0e848bd94f0f1afa377 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/comparator/operator_statistic_comparator.py @@ -0,0 +1,28 @@ +from compare_backend.comparator.base_comparator import BaseComparator +from compare_backend.utils.common_func import update_order_id + + +class OperatorStatisticComparator(BaseComparator): + def __init__(self, origin_data: list, bean: any): + super().__init__(origin_data, bean) + + def _compare(self): + if not self._origin_data: + return + base_op_dict, comparison_op_dict = self._group_by_op_name() + for op_name, base_data in base_op_dict.items(): + comparison_data = comparison_op_dict.pop(op_name, []) + self._rows.append(self._bean(op_name, base_data, comparison_data).row) + for op_name, comparison_data in comparison_op_dict.items(): + self._rows.append(self._bean(op_name, [], comparison_data).row) + self._rows.sort(key=lambda x: x[-2], reverse=True) # order by diff column + update_order_id(self._rows) + + def _group_by_op_name(self): + base_op_dict, comparison_op_dict = {}, {} + for base_op, comparison_op in self._origin_data: + if base_op: + base_op_dict.setdefault(base_op.name, 
[]).append(base_op) + if comparison_op: + comparison_op_dict.setdefault(comparison_op.name, []).append(comparison_op) + return base_op_dict, comparison_op_dict diff --git a/profiler/compare_tools_review/compare_backend/comparator/overall_performance_comparator.py b/profiler/compare_tools_review/compare_backend/comparator/overall_performance_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..7283c17b47dea78058d0541c1332df0fa45e90d9 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/comparator/overall_performance_comparator.py @@ -0,0 +1,76 @@ +from compare_backend.comparator.base_comparator import BaseComparator +from compare_backend.utils.constant import Constant + + +class OverallPerformanceComparator(BaseComparator): + def __init__(self, origin_data: dict, bean: any): + super().__init__(origin_data, bean) + + def _compare(self): + base_profiling_info = self._origin_data.get(Constant.BASE_DATA) + comp_profiling_info = self._origin_data.get(Constant.COMPARISON_DATA) + self._headers = [''] + base_col = [f'{base_profiling_info.profiling_type}'] + comp_col = [f'{comp_profiling_info.profiling_type}'] + if not base_profiling_info.hide_op_details and not comp_profiling_info.hide_op_details: + self._headers.extend(['Cube Time(Num)', 'Vector Time(Num)']) + base_col.extend([f'{base_profiling_info.cube_time:.3f}s({base_profiling_info.cube_num})', + f'{base_profiling_info.vec_time:.3f}s({base_profiling_info.vec_num})']) + comp_col.extend([f'{comp_profiling_info.cube_time:.3f}s({comp_profiling_info.cube_num})', + f'{comp_profiling_info.vec_time:.3f}s({comp_profiling_info.vec_num})']) + if base_profiling_info.conv_time_fwd or comp_profiling_info.conv_time_fwd: + self._headers.append('Conv Time(Forward)(Num)') + base_col.append(f'{base_profiling_info.conv_time_fwd:.3f}s({base_profiling_info.conv_num_fwd})') + comp_col.append(f'{comp_profiling_info.conv_time_fwd:.3f}s({comp_profiling_info.conv_num_fwd})') + if 
base_profiling_info.conv_time_bwd or comp_profiling_info.conv_time_bwd: + self._headers.append('Conv Time(Backward)(Num)') + base_col.append(f'{base_profiling_info.conv_time_bwd:.3f}s({base_profiling_info.conv_num_bwd})') + comp_col.append(f'{comp_profiling_info.conv_time_bwd:.3f}s({comp_profiling_info.conv_num_bwd})') + if base_profiling_info.fa_time_fwd or comp_profiling_info.fa_time_fwd: + self._headers.append('Flash Attention Time(Forward)(Num)') + base_col.append(f'{base_profiling_info.fa_time_fwd:.3f}s({base_profiling_info.fa_num_fwd})') + comp_col.append(f'{comp_profiling_info.fa_time_fwd:.3f}s({comp_profiling_info.fa_num_fwd})') + if base_profiling_info.fa_time_bwd or comp_profiling_info.fa_time_bwd: + self._headers.append('Flash Attention Time(Backward)(Num)') + base_col.append(f'{base_profiling_info.fa_time_bwd:.3f}s({base_profiling_info.fa_num_bwd})') + comp_col.append(f'{comp_profiling_info.fa_time_bwd:.3f}s({comp_profiling_info.fa_num_bwd})') + if base_profiling_info.pa_time or comp_profiling_info.pa_time: + self._headers.append('Paged Attention Time(Num)') + base_col.append(f'{base_profiling_info.pa_time:.3f}s({base_profiling_info.pa_num})') + comp_col.append(f'{comp_profiling_info.pa_time:.3f}s({comp_profiling_info.pa_num})') + if base_profiling_info.lccl_time or comp_profiling_info.lccl_time: + self._headers.append('Lccl Time(Num)') + base_col.append(f'{base_profiling_info.lccl_time:.3f}s({base_profiling_info.lccl_num})') + comp_col.append(f'{comp_profiling_info.lccl_time:.3f}s({comp_profiling_info.lccl_num})') + if base_profiling_info.other_time or comp_profiling_info.other_time: + self._headers.append('Other Time') + base_col.append(f'{base_profiling_info.other_time:.3f}s') + comp_col.append(f'{comp_profiling_info.other_time:.3f}s') + self._headers.extend(['Computing Time']) + base_col.extend([f'{base_profiling_info.compute_time:.3f}s']) + comp_col.extend([f'{comp_profiling_info.compute_time:.3f}s']) + if base_profiling_info.memory_used or 
comp_profiling_info.memory_used: + self._headers.append('Mem Usage') + base_col.append(f'{base_profiling_info.memory_used:.2f}G') + comp_col.append(f'{comp_profiling_info.memory_used:.2f}G') + self._headers.extend(['Uncovered Communication Time(Wait Time)']) + if base_profiling_info.wait_time: + base_col.extend( + [f'{base_profiling_info.communication_not_overlapped: .3f}s({base_profiling_info.wait_time:.3f}s)']) + else: + base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s( / )']) + if comp_profiling_info.is_level0: + comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s( / )']) + else: + comp_col.extend( + [f'{comp_profiling_info.communication_not_overlapped: .3f}s({comp_profiling_info.wait_time:.3f}s)']) + if base_profiling_info.sdma_time or comp_profiling_info.sdma_time: + self._headers.append('SDMA Time(Num)') + base_col.append(f'{base_profiling_info.sdma_time:.3f}s({base_profiling_info.sdma_num})') + comp_col.append(f'{comp_profiling_info.sdma_time:.3f}s({comp_profiling_info.sdma_num})') + cue = '(Not minimal profiling)' if base_profiling_info.is_not_minimal_profiling() or \ + comp_profiling_info.is_not_minimal_profiling() else '' + self._headers.extend(['Free Time', 'E2E Time' + cue]) + base_col.extend([f'{base_profiling_info.scheduling_time:.3f}s', f'{base_profiling_info.e2e_time:.3f}s']) + comp_col.extend([f'{comp_profiling_info.scheduling_time:.3f}s', f'{comp_profiling_info.e2e_time:.3f}s']) + self._rows = [base_col, comp_col] diff --git a/profiler/compare_tools_review/compare_backend/compare_bean/__init__.py b/profiler/compare_tools_review/compare_backend/compare_bean/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/compare_tools_review/compare_backend/compare_bean/communication_bean.py b/profiler/compare_tools_review/compare_backend/compare_bean/communication_bean.py new file mode 100644 index 
0000000000000000000000000000000000000000..94813193d69b4a1f92cc88dbd1eb31d6f96ff608 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/compare_bean/communication_bean.py @@ -0,0 +1,72 @@ +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig +from compare_backend.utils.common_func import calculate_diff_ratio + + +class CommunicationInfo: + + def __init__(self, name: str, data_list: list, is_task: bool): + self.comm_op_name = None + self.task_name = None + self.calls = None + self.total_duration = 0 + self.avg_duration = None + self.max_duration = None + self.min_duration = None + if data_list: + self.comm_op_name = "|" if is_task else name + self.task_name = name if is_task else None + self.calls = len(data_list) + self.total_duration = sum(data_list) + self.avg_duration = sum(data_list) / len(data_list) + self.max_duration = max(data_list) + self.min_duration = min(data_list) + + +class CommunicationBean: + TABLE_NAME = Constant.COMMUNICATION_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, name: str, base_comm_data: dict, comparison_comm_data: dict): + self._name = name + self._base_comm = base_comm_data + self._comparison_comm = comparison_comm_data + + @property + def rows(self): + rows = [] + base_comm = CommunicationInfo(self._name, self._base_comm.get("comm_list", []), is_task=False) + comparison_comm = CommunicationInfo(self._name, self._comparison_comm.get("comm_list", []), is_task=False) + rows.append(self._get_row(base_comm, comparison_comm, is_task=False)) + + base_task = self._base_comm.get("comm_task", {}) + comparison_task = self._comparison_comm.get("comm_task", {}) + if not base_task and not comparison_task: + return rows + + for task_name, task_list in base_task.items(): + base_task_info = CommunicationInfo(task_name, task_list, is_task=True) + comparison_task_info = CommunicationInfo("", [], 
is_task=True) + for _task_name, _task_list in comparison_task.items(): + comparison_task_info = CommunicationInfo(_task_name, _task_list, is_task=True) + comparison_task.pop(_task_name, None) + break + rows.append(self._get_row(base_task_info, comparison_task_info, is_task=True)) + for task_name, task_list in comparison_task.items(): + base_task_info = CommunicationInfo("", [], is_task=True) + comparison_task_info = CommunicationInfo(task_name, task_list, is_task=True) + rows.append(self._get_row(base_task_info, comparison_task_info, is_task=True)) + + return rows + + @classmethod + def _get_row(cls, base_info: CommunicationInfo, comparison_info: CommunicationInfo, is_task: bool) -> list: + row = [None, base_info.comm_op_name, base_info.task_name, base_info.calls, base_info.total_duration, + base_info.avg_duration, base_info.max_duration, base_info.min_duration, comparison_info.comm_op_name, + comparison_info.task_name, comparison_info.calls, comparison_info.total_duration, + comparison_info.avg_duration, comparison_info.max_duration, comparison_info.min_duration] + diff_fields = [None, None] if is_task else calculate_diff_ratio(base_info.total_duration, + comparison_info.total_duration) + row.extend(diff_fields) + return row diff --git a/profiler/compare_tools_review/compare_backend/compare_bean/memory_compare_bean.py b/profiler/compare_tools_review/compare_backend/compare_bean/memory_compare_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..e1baa175311ae42765757feb8b13bbb3918c3727 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/compare_bean/memory_compare_bean.py @@ -0,0 +1,47 @@ +from compare_backend.utils.common_func import calculate_diff_ratio +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig +from compare_backend.utils.torch_op_node import TorchOpNode +from compare_backend.utils.tree_builder import TreeBuilder + + +class MemoryCompareBean: + TABLE_NAME 
= Constant.MEMORY_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, index: int, base_op: TorchOpNode, comparison_op: TorchOpNode): + self._index = index + self._base_op = MemoryInfo(base_op) + self._comparison_op = MemoryInfo(comparison_op) + + @property + def row(self): + row = [self._index + 1, self._base_op.operator_name, self._base_op.input_shape, self._base_op.input_type, + self._base_op.memory_details, self._base_op.size, self._comparison_op.operator_name, + self._comparison_op.input_shape, self._comparison_op.input_type, self._comparison_op.memory_details, + self._comparison_op.size] + diff_fields = calculate_diff_ratio(self._base_op.size, self._comparison_op.size) + row.extend(diff_fields) + return row + + +class MemoryInfo: + def __init__(self, torch_op: TorchOpNode): + self.operator_name = None + self.input_shape = None + self.input_type = None + self.size = 0 + self.memory_details = "" + self._memory_list = [] + if torch_op: + self.operator_name = torch_op.name + self.input_shape = torch_op.input_shape + self.input_type = torch_op.input_type + self._memory_list = TreeBuilder.get_total_memory(torch_op) + self._update_memory_fields() + + def _update_memory_fields(self): + for memory in self._memory_list: + self.size += memory.size + self.memory_details += memory.memory_details diff --git a/profiler/compare_tools_review/compare_backend/compare_bean/memory_statistic_bean.py b/profiler/compare_tools_review/compare_backend/compare_bean/memory_statistic_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..9ccc2cb76da9158355aacb0994a1b66c0be97fb5 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/compare_bean/memory_statistic_bean.py @@ -0,0 +1,38 @@ +from compare_backend.utils.common_func import calculate_diff_ratio +from compare_backend.utils.constant import Constant +from compare_backend.utils.tree_builder import TreeBuilder +from 
compare_backend.utils.excel_config import ExcelConfig + + +class MemoryStatisticBean: + TABLE_NAME = Constant.MEMORY_TOP_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, name: str, base_data: list, comparison_data: list): + self._name = name + self._base_info = MemoryStatisticInfo(base_data) + self._comparison_info = MemoryStatisticInfo(comparison_data) + + @property + def row(self): + row = [None, self._name, self._base_info.duration_ms, self._base_info.size_mb, self._base_info.number, + self._comparison_info.duration_ms, self._comparison_info.size_mb, self._comparison_info.number] + diff_fields = calculate_diff_ratio(self._base_info.size_mb, self._comparison_info.size_mb) + row.extend(diff_fields) + return row + + +class MemoryStatisticInfo: + def __init__(self, data_list: list): + self._data_list = data_list + self.duration_ms = 0 + self.size_mb = 0 + self.number = len(data_list) + self._get_info() + + def _get_info(self): + for op_data in self._data_list: + memory_list = TreeBuilder.get_total_memory(op_data) + self.duration_ms += sum([memory.duration / Constant.US_TO_MS for memory in memory_list]) + self.size_mb += sum([memory.size / Constant.KB_TO_MB for memory in memory_list]) diff --git a/profiler/compare_tools_review/compare_backend/compare_bean/module_compare_bean.py b/profiler/compare_tools_review/compare_backend/compare_bean/module_compare_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..abfce00d83d6c1a914aa71481277e2dc1c195f17 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/compare_bean/module_compare_bean.py @@ -0,0 +1,83 @@ +from compare_backend.utils.common_func import longest_common_subsequence_matching, calculate_diff_ratio +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig +from compare_backend.utils.module_node import ModuleNode +from 
compare_backend.utils.name_function import NameFunction +from compare_backend.utils.torch_op_node import TorchOpNode + + +class ModuleCompareBean: + TABLE_NAME = Constant.MODULE_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, base_module: ModuleNode, comparison_module: ModuleNode): + self._base_module = ModuleInfo(base_module) + self._comparison_module = ModuleInfo(comparison_module) + self.module_class = self._base_module.module_class if base_module else self._comparison_module.module_class + self.module_level = self._base_module.module_level if base_module else self._comparison_module.module_level + self.module_name = self._base_module.module_name if base_module else self._comparison_module.module_name + + @property + def rows(self): + return [self.get_total_row(), *self.get_detail_rows()] + + def get_total_row(self): + total_diff, total_ratio = calculate_diff_ratio(self._base_module.device_total_time, + self._comparison_module.device_total_time) + self_diff, _ = calculate_diff_ratio(self._base_module.device_self_time, + self._comparison_module.device_self_time) + return [None, self.module_class, self.module_level, self.module_name, "TOTAL", None, + self._base_module.device_self_time, self._base_module.device_total_time, "TOTAL", None, + self._comparison_module.device_self_time, self._comparison_module.device_total_time, total_diff, + self_diff, total_ratio, self._base_module.call_stack, self._comparison_module.call_stack] + + def get_detail_rows(self): + rows = [] + matched_ops = longest_common_subsequence_matching(self._base_module.top_layer_ops, + self._comparison_module.top_layer_ops, NameFunction.get_name) + for base_op, comparison_op in matched_ops: + base_op = OpInfo(base_op) + comparison_op = OpInfo(comparison_op) + self_diff, self_ratio = calculate_diff_ratio(base_op.device_self_time, comparison_op.device_self_time) + base_call_stack = base_op.call_stack if self_diff > 0 else 
None + comparison_call_stack = comparison_op.call_stack if self_diff > 0 else None + rows.append( + [None, self.module_class, self.module_level, self.module_name, base_op.operator_name, + base_op.kernel_details, base_op.device_self_time, None, comparison_op.operator_name, + comparison_op.kernel_details, comparison_op.device_self_time, None, None, self_diff, self_ratio, + base_call_stack, comparison_call_stack]) + return rows + + +class ModuleInfo: + def __init__(self, module: ModuleNode): + self.module_class = "" + self.module_level = "" + self.module_name = "" + self.device_self_time = 0 + self.device_total_time = 0 + self.top_layer_ops = [] + self.call_stack = "" + if module: + self.module_class = module.module_class + self.module_level = module.module_level + self.module_name = module.module_name.replace("nn.Module:", "") + self.device_self_time = module.device_self_dur + self.device_total_time = module.device_total_dur + self.top_layer_ops = module.toy_layer_api_list + self.call_stack = module.call_stack + + +class OpInfo: + def __init__(self, operator: TorchOpNode): + self.operator_name = "" + self.kernel_details = "" + self.device_self_time = 0 + self.call_stack = "" + if operator: + self.operator_name = operator.name + for kernel in operator.kernel_list: + self.device_self_time += kernel.device_dur + self.kernel_details += kernel.kernel_details + self.call_stack = operator.call_stack diff --git a/profiler/compare_tools_review/compare_backend/compare_bean/module_statistic_bean.py b/profiler/compare_tools_review/compare_backend/compare_bean/module_statistic_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..97fc98bdd354e1ebe1fbb3fc44def4eaf3059235 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/compare_bean/module_statistic_bean.py @@ -0,0 +1,98 @@ +import re + +from compare_backend.utils.common_func import calculate_diff_ratio +from compare_backend.utils.constant import Constant +from 
compare_backend.utils.excel_config import ExcelConfig + + +class ModuleStatisticBean: + TABLE_NAME = Constant.MODULE_TOP_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, name: str, base_data: list, comparison_data: list): + self._module_name = name.replace("nn.Module:", "") + pattern = re.compile('_[0-9]+$') + self._module_class = pattern.sub('', name.split("/")[-1]) + self._module_level = name.count("/") + self._base_info = ModuleStatisticInfo(base_data) + self._comparison_info = ModuleStatisticInfo(comparison_data) + + @property + def rows(self): + rows = [self.get_total_row()] + rows.extend(self.get_detail_rows()) + return rows + + @staticmethod + def _get_kernel_detail_rows(base_kernel_dict, com_kernel_dict): + base_kernel_detals = "" + com_kernel_details = "" + for kernel_name, base_dur_list in base_kernel_dict.items(): + base_dur = "%.3f" % sum(base_dur_list) + base_kernel_detals += f"{kernel_name}, [number: {len(base_dur_list)}], [duration_ms: {base_dur}]\n" + for kernel_name, com_dur_list in com_kernel_dict.items(): + com_dur = "%.3f" % sum(com_dur_list) + com_kernel_details += f"{kernel_name}, [number: {len(com_dur_list)}], [duration_ms: {com_dur}]\n" + return [base_kernel_detals, com_kernel_details] + + def get_total_row(self): + total_diff, total_ratio = calculate_diff_ratio(self._base_info.device_total_dur_ms, + self._comparison_info.device_total_dur_ms) + self_diff, _ = calculate_diff_ratio(self._base_info.device_self_dur_ms, + self._comparison_info.device_self_dur_ms) + row = [None, self._module_class, self._module_level, self._module_name, "[ TOTAL ]", None, + self._base_info.device_self_dur_ms, self._base_info.number, self._base_info.device_total_dur_ms, + None, self._comparison_info.device_self_dur_ms, self._comparison_info.number, + self._comparison_info.device_total_dur_ms, total_diff, self_diff, + total_ratio, self._base_info.call_stack, 
self._comparison_info.call_stack] + return row + + def get_detail_rows(self): + rows = [] + for op_name, base_dur_dict in self._base_info.api_dict.items(): + base_dur_list = base_dur_dict.get("total", []) + com_dur_dict = self._comparison_info.api_dict.pop(op_name, {}) + com_dur_list = com_dur_dict.get("total", []) + base_kernel_detals, com_kernel_details = self._get_kernel_detail_rows(base_dur_dict.get("detail", {}), + com_dur_dict.get("detail", {})) + self_diff, self_ratio = calculate_diff_ratio(sum(base_dur_list), sum(com_dur_list)) + row = [None, self._module_class, self._module_level, self._module_name, op_name, base_kernel_detals, + sum(base_dur_list), len(base_dur_list), None, com_kernel_details, sum(com_dur_list), + len(com_dur_list), None, None, self_diff, self_ratio, None, None] + rows.append(row) + + for op_name, com_dur_dict in self._comparison_info.api_dict.items(): + com_dur_list = com_dur_dict.get("total", []) + base_kernel_detals, com_kernel_details = self._get_kernel_detail_rows({}, com_dur_dict.get("detail", {})) + self_diff, self_ratio = calculate_diff_ratio(0, sum(com_dur_list)) + row = [None, self._module_class, self._module_level, self._module_name, op_name, base_kernel_detals, 0, 0, + None, com_kernel_details, sum(com_dur_list), len(com_dur_list), None, None, self_diff, + self_ratio, None, None] + rows.append(row) + return rows + + +class ModuleStatisticInfo: + def __init__(self, data_list: list): + self._data_list = data_list + self.device_self_dur_ms = 0 + self.device_total_dur_ms = 0 + self.call_stack = "" + self.number = len(data_list) + self.api_dict = {} + self._get_info() + + def _get_info(self): + if self._data_list: + self.call_stack = self._data_list[0].call_stack + for module in self._data_list: + self.device_self_dur_ms += module.device_self_dur / Constant.US_TO_MS + self.device_total_dur_ms += module.device_total_dur / Constant.US_TO_MS + for torch_op in module.toy_layer_api_list: + self.api_dict.setdefault(torch_op.name, 
{}).setdefault("total", []).append( + torch_op.device_dur / Constant.US_TO_MS) + for kernel in torch_op.kernel_list: + self.api_dict.setdefault(torch_op.name, {}).setdefault("detail", {}).setdefault(kernel.kernel_name, + []).append( + kernel.device_dur / Constant.US_TO_MS) diff --git a/profiler/compare_tools_review/compare_backend/compare_bean/operator_compare_bean.py b/profiler/compare_tools_review/compare_backend/compare_bean/operator_compare_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..e7ecfedddd7c2f5dd33664b1556a7b0245e295d1 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/compare_bean/operator_compare_bean.py @@ -0,0 +1,47 @@ +from compare_backend.utils.common_func import calculate_diff_ratio +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig +from compare_backend.utils.torch_op_node import TorchOpNode +from compare_backend.utils.tree_builder import TreeBuilder + + +class OperatorCompareBean: + TABLE_NAME = Constant.OPERATOR_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, index: int, base_op: TorchOpNode, comparison_op: TorchOpNode): + self._index = index + self._base_op = OperatorInfo(base_op) + self._comparison_op = OperatorInfo(comparison_op) + + @property + def row(self): + row = [self._index + 1, self._base_op.operator_name, self._base_op.input_shape, self._base_op.input_type, + self._base_op.kernel_details, self._base_op.device_dur, self._comparison_op.operator_name, + self._comparison_op.input_shape, self._comparison_op.input_type, self._comparison_op.kernel_details, + self._comparison_op.device_dur] + diff_fields = calculate_diff_ratio(self._base_op.device_dur, self._comparison_op.device_dur) + row.extend(diff_fields) + return row + + +class OperatorInfo: + def __init__(self, torch_op: TorchOpNode): + self.operator_name = None + self.input_shape = None + 
self.input_type = None + self.device_dur = 0 + self.kernel_details = "" + self._kernel_list = [] + if torch_op: + self.operator_name = torch_op.name + self.input_shape = torch_op.input_shape + self.input_type = torch_op.input_type + self._kernel_list = TreeBuilder.get_total_kernels(torch_op) + self._update_kernel_fields() + + def _update_kernel_fields(self): + for kernel in self._kernel_list: + self.device_dur += kernel.device_dur + self.kernel_details += kernel.kernel_details diff --git a/profiler/compare_tools_review/compare_backend/compare_bean/operator_statistic_bean.py b/profiler/compare_tools_review/compare_backend/compare_bean/operator_statistic_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..457ae55acbd275dcf3e2f3c584114af8b9d55d17 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/compare_bean/operator_statistic_bean.py @@ -0,0 +1,36 @@ +from compare_backend.utils.common_func import calculate_diff_ratio +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig +from compare_backend.utils.tree_builder import TreeBuilder + + +class OperatorStatisticBean: + TABLE_NAME = Constant.OPERATOR_TOP_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, name: str, base_data: list, comparison_data: list): + self._name = name + self._base_info = OperatorStatisticInfo(base_data) + self._comparison_info = OperatorStatisticInfo(comparison_data) + + @property + def row(self): + row = [None, self._name, self._base_info.device_dur_ms, self._base_info.number, + self._comparison_info.device_dur_ms, self._comparison_info.number] + diff_fields = calculate_diff_ratio(self._base_info.device_dur_ms, self._comparison_info.device_dur_ms) + row.extend(diff_fields) + return row + + +class OperatorStatisticInfo: + def __init__(self, data_list: list): + self._data_list = data_list + self.device_dur_ms = 0 + self.number 
class KernelEvent:
    """Read-only view over one device-kernel trace event.

    Wraps a TraceEventBean and exposes just the fields needed when
    matching kernels between two profiling runs.
    """

    def __init__(self, event: TraceEventBean, device_type: str):
        self._event = event
        self._device_type = device_type

    @property
    def kernel_name(self) -> str:
        return self._event.name

    @property
    def device_dur(self) -> float:
        return self._event.dur

    @property
    def task_id(self) -> int:
        return self._event.task_id

    @property
    def task_type(self) -> str:
        return self._event.task_type

    @property
    def kernel_details(self):
        # GPU events carry no task id/type, so they use the short form.
        if self._device_type != Constant.GPU:
            return f"{self.kernel_name}, {self.task_id}, {self.task_type} [duration: {self.device_dur}]\n"
        return f"{self.kernel_name} [duration: {self.device_dur}]\n"


class MemoryEvent:
    """Read-only view over one operator-memory record (a plain dict)."""

    def __init__(self, event: dict):
        self._event = event
        self._name = ""
        self._size = 0.0
        self._ts = Decimal(0)
        self._release_time = Decimal(0)
        self._allocation_time = Decimal(0)
        self._duration = 0.0
        self.init()

    @property
    def size(self) -> float:
        return self._size

    @property
    def duration(self) -> float:
        return self._duration

    @property
    def memory_details(self) -> str:
        # Prefer the name stored in the raw record; fall back to the one
        # injected later through set_name().
        name = self._event.get(Constant.NAME, "") or self._name
        return f"{name}, ({self._allocation_time}, {self._release_time}), " \
               f"[duration: {self._duration}], [size: {self._size}]\n"

    @property
    def is_torch_op(self) -> bool:
        # Memory events are never torch ops; kept for a uniform interface.
        return False

    @property
    def start_time(self) -> Decimal:
        return self._ts

    def set_name(self, name: str):
        self._name = name

    def init(self):
        """Populate fields from the raw record dict."""
        self._size = self._event.get(Constant.SIZE, 0)
        self._ts = self._event.get(Constant.TS, 0)
        self._release_time = self._event.get(Constant.RELEASE_TIME)
        self._allocation_time = self._event.get(Constant.ALLOCATION_TIME)
        if self._release_time and self._allocation_time:
            self._duration = float(self._release_time - self._allocation_time)
        else:
            # Either timestamp missing -> lifetime unknown, report 0.
            self._duration = 0.0


class KernelDetailsBean:
    """One row of kernel_details.csv with typed accessors and the
    op-category predicates used by the NPU profiling parser."""

    def __init__(self, data: dict):
        self._data = data
        self._op_type = ""
        self._name = ""
        self._aiv_vec_time = 0.0
        self._mac_time = 0.0
        self._duration = 0.0
        self.init()

    @property
    def op_type(self) -> str:
        return self._op_type

    @property
    def name(self) -> str:
        return self._name

    @property
    def aiv_vec_time(self) -> float:
        # Empty / "N/A" cells become NaN so callers can test with pd.isna().
        if self._aiv_vec_time in ("", "N/A"):
            return float("nan")
        return convert_to_float(self._aiv_vec_time)

    @property
    def mac_time(self) -> float:
        if self._mac_time in ("", "N/A"):
            return float("nan")
        return convert_to_float(self._mac_time)

    @property
    def duration(self) -> float:
        return convert_to_float(self._duration)

    def is_hide_op_pmu(self):
        # When neither PMU column exists, per-op PMU details are hidden.
        return "mac_time(us)" not in self._data and "aiv_vec_time(us)" not in self._data

    def is_vector(self):
        # Vector op: positive AIV vector time, or zero cube (mac) time.
        aiv = self.aiv_vec_time
        if not pd.isna(aiv) and aiv > 0:
            return True
        mac = self.mac_time
        return not pd.isna(mac) and math.isclose(mac, 0.0)

    def is_invalid(self):
        # Row is unusable when both PMU readings are missing.
        return pd.isna(self.aiv_vec_time) and pd.isna(self.mac_time)

    def is_fa_bwd(self):
        lowered = self.op_type.lower()
        return 'bwd' in lowered or 'grad' in lowered

    def is_sdma(self):
        lowered = self.name.lower()
        return lowered.startswith("aclnninplacecopy") and "tensormove" in lowered

    def is_flash_attention(self):
        return "flashattention" in self.op_type.lower()

    def is_cube(self):
        return "matmul" in self.op_type.lower()

    def is_conv(self):
        return self.op_type.lower().startswith("conv")

    def is_conv_bwd(self):
        lower_op_type = self.op_type.lower()
        return any(bwd in lower_op_type for bwd in Constant.BWD_LIST)

    def is_page_attention(self):
        return "pagedattention" in self.op_type.lower()

    def init(self):
        """Populate fields from the raw CSV row dict."""
        self._op_type = self._data.get('Type', "")
        self._name = self._data.get('Name', "")
        self._aiv_vec_time = self._data.get('aiv_vec_time(us)', "")
        self._mac_time = self._data.get('mac_time(us)', "")
        self._duration = self._data.get('Duration(us)', 0)
class MemoryRecordBean:
    """One row of memory_record.csv; exposes total reserved memory in MB."""

    def __init__(self, data: dict):
        self._data = data
        self._total_reserved_mb = 0.0
        self.init()

    @property
    def total_reserved_mb(self) -> float:
        return convert_to_float(self._total_reserved_mb)

    def init(self):
        self._total_reserved_mb = self._data.get("Total Reserved(MB)", 0)


class OperatorMemoryBean:
    """One row of operator_memory.csv with typed accessors."""

    def __init__(self, data: dict):
        self._data = data
        self._name = ""
        self._size = 0.0
        self._allocation_time = Decimal(0)
        self._release_time = Decimal(0)
        self.init()

    @property
    def name(self) -> str:
        return self._name

    @property
    def size(self) -> float:
        return convert_to_float(self._size)

    @property
    def allocation_time(self) -> Decimal:
        # Missing/zero timestamps collapse to Decimal(0).
        if not self._allocation_time:
            return Decimal(0)
        return convert_to_decimal(self._allocation_time)

    @property
    def release_time(self) -> Decimal:
        if not self._release_time:
            return Decimal(0)
        return convert_to_decimal(self._release_time)

    def init(self):
        """Populate fields from the raw CSV row dict."""
        self._name = self._data.get("Name", "")
        self._size = self._data.get("Size(KB)", 0)
        self._allocation_time = self._data.get("Allocation Time(us)", 0)
        self._release_time = self._data.get("Release Time(us)", 0)

    def is_cann_op(self):
        return "cann::" in self._name


class TraceEventBean:
    """Typed wrapper over one raw Chrome-trace event dict.

    Field access is defensive: a non-dict event leaves every field at the
    default set in __init__.
    """

    def __init__(self, event: dict):
        self._event = event
        self._pid = 0
        self._tid = 0
        self._ts = Decimal(0)
        self._dur = 0.0
        self._ph = ""
        self._cat = ""
        self._name = ""
        self._args = {}
        self._is_torch_op = False
        self.init()

    @property
    def pid(self) -> int:
        return self._pid

    @property
    def tid(self) -> int:
        return self._tid

    @property
    def dur(self) -> float:
        return convert_to_float(self._dur)

    @property
    def start_time(self) -> Decimal:
        return convert_to_decimal(self._ts)

    @property
    def end_time(self) -> Decimal:
        return self.start_time + convert_to_decimal(self._dur)

    @property
    def name(self) -> str:
        return self._name

    @property
    def lower_name(self) -> str:
        return self._name.lower()

    @property
    def lower_cat(self) -> str:
        return self._cat.lower()

    @property
    def args(self) -> dict:
        return self._args

    @property
    def id(self) -> str:
        return self._event.get("id")

    @property
    def stream_id(self) -> int:
        return self._args.get('Stream Id')

    @property
    def stream(self) -> int:
        return self._args.get("stream")

    @property
    def task_type(self) -> str:
        return self._args.get('Task Type')

    @property
    def task_id(self) -> int:
        return self._args.get('Task Id')

    @property
    def device_id(self) -> int:
        # 'Device Id' may be absent or malformed; fall back to the sentinel.
        try:
            return int(self._args.get('Device Id', Constant.INVALID_VALUE))
        except Exception:
            return Constant.INVALID_VALUE

    @property
    def total_reserved(self):
        return self._args.get('Total Reserved', 0)

    @property
    def corr_id(self) -> int:
        return self._args.get('correlation_id')

    @property
    def process_name(self) -> int:
        return self._args.get("name", "")

    @property
    def bytes_kb(self) -> int:
        return self._args.get("Bytes", 0) / Constant.BYTE_TO_KB

    @property
    def addr(self) -> str:
        return self._args.get("Addr")

    @property
    def event(self) -> dict:
        return self._event

    @property
    def is_torch_op(self) -> bool:
        return self._is_torch_op

    @is_torch_op.setter
    def is_torch_op(self, value: bool):
        self._is_torch_op = value

    def is_m_mode(self) -> bool:
        return self._ph == "M"

    def is_x_mode(self) -> bool:
        return self._ph == "X"

    def is_flow_start(self) -> bool:
        return self._ph == "s"

    def is_flow_end(self) -> bool:
        return self._ph == "f"

    def is_enqueue(self) -> bool:
        return self.lower_cat == "enqueue"

    def is_dequeue(self) -> bool:
        return self.lower_cat == "dequeue"

    def is_process_meta(self) -> bool:
        return self.is_m_mode() and self._name == "process_name"

    def is_thread_meta(self) -> bool:
        return self.is_m_mode() and self._name == "thread_name"

    def is_communication_op_thread(self) -> bool:
        return "Communication" in self._args.get("name", "")

    def is_hccl_process_name(self) -> bool:
        return self.process_name == "HCCL"

    def is_overlap_process_name(self) -> bool:
        return self.process_name == "Overlap Analysis"

    def is_npu_process_name(self) -> bool:
        return self.process_name == "Ascend Hardware"

    def is_computing_event(self):
        return self._name == "Computing"

    def is_comm_not_overlap(self):
        return self._name == 'Communication(Not Overlapped)'

    def is_dict(self):
        return isinstance(self._event, dict)

    def is_kernel_cat(self):
        return self.lower_cat == "kernel"

    def is_nccl_name(self):
        return self.lower_name.startswith("nccl")

    def is_kernel_except_nccl(self):
        return self.is_kernel_cat() and not self.is_nccl_name()

    def is_memory_event(self):
        return self.lower_name == '[memory]' and self.device_id >= 0

    def is_compute_event(self):
        return self.task_type in ('AI_CORE', 'MIX_AIC', 'MIX_AIV', 'AI_CPU', 'AI_VECTOR_CORE', 'FFTS_PLUS')

    def is_sdma_event(self):
        return self.task_type in ('SDMA_SQE', 'PCIE_DMA_SQE')

    def is_event_wait(self):
        return self.task_type == 'EVENT_WAIT_SQE'

    def is_backward(self):
        return any(bwd in self.lower_name for bwd in Constant.BWD_LIST)

    def is_python_function(self):
        return self.lower_cat == "python_function"

    def is_optimizer(self):
        return self.lower_name.startswith("optimizer")

    def is_fwdbwd(self):
        return self.lower_cat == "fwdbwd"

    def is_step_profiler(self):
        return "ProfilerStep#" in self.name

    def reset_name(self, name):
        self._name = name

    def is_conv(self):
        return self.name.lower().startswith("aten::conv")

    def is_lccl(self):
        return self.lower_name == "kernel_aivec"

    def init(self):
        """Populate fields from the raw event; no-op for non-dict events."""
        if not isinstance(self._event, dict):
            return
        self._pid = self._event.get("pid", 0)
        self._tid = self._event.get("tid", 0)
        self._ts = self._event.get("ts", 0)
        self._dur = self._event.get("dur", 0)
        self._ph = self._event.get("ph", "")
        self._cat = self._event.get("cat", "")
        self._name = self._event.get("name", "")
        self._args = self._event.get("args", {})
class ProfilingInfo:
    """Accumulator for the overall performance metrics of one profiling run.

    Durations are accumulated in microseconds; trans_time_to_s() converts
    them all to seconds exactly once. Attribute assignment order in
    __init__ is preserved deliberately because __dict__ order is exposed
    to consumers of the overall result.
    """

    TABLE_NAME = Constant.PERFORMANCE_TABLE
    HEADERS = []
    OVERHEAD = []

    def __init__(self, profiling_type: str):
        self.profiling_type = profiling_type
        self.cube_time = 0.0
        self.other_time = 0.0
        self.vec_time = 0.0
        self.cube_num = 0
        self.vec_num = 0
        self.sdma_num = 0
        self.fa_num_fwd = 0
        self.fa_num_bwd = 0
        self.pa_num = 0
        self.lccl_num = 0
        self.conv_time_fwd = 0.0
        self.conv_time_bwd = 0.0
        self.conv_num_fwd = 0
        self.conv_num_bwd = 0
        self.compute_time = 0.0
        self.communication_not_overlapped = 0.0
        self.wait_time = 0.0
        self.memory_used = 0.0
        self.e2e_time = 0.0
        self.sdma_time = 0.0
        self.scheduling_time = 0.0
        self.fa_time_bwd = 0.0
        self.pa_time = 0.0
        self.lccl_time = 0.0
        self.fa_time_fwd = 0.0
        self.minimal_profiling = False
        self.hide_op_details = False
        self.is_level0 = False

    def trans_time_to_s(self):
        """Convert every accumulated duration from microseconds to seconds."""
        us_per_s = 10 ** 6
        self.cube_time /= us_per_s
        self.other_time /= us_per_s
        self.vec_time /= us_per_s
        self.compute_time /= us_per_s
        self.communication_not_overlapped /= us_per_s
        self.wait_time /= us_per_s
        self.e2e_time /= us_per_s
        self.sdma_time /= us_per_s
        self.scheduling_time /= us_per_s
        self.fa_time_bwd /= us_per_s
        self.fa_time_fwd /= us_per_s
        self.pa_time /= us_per_s
        self.lccl_time /= us_per_s
        self.conv_time_fwd /= us_per_s
        self.conv_time_bwd /= us_per_s

    def calculate_other_time(self):
        """Other = compute time minus all categorized time, floored at 0."""
        self.other_time = max(
            0, self.compute_time - self.cube_time - self.fa_time_fwd - self.fa_time_bwd
            - self.pa_time - self.vec_time - self.conv_time_fwd - self.conv_time_bwd)

    def calculate_vec_time(self):
        """Vector time = compute minus cube, flash-attention and conv time."""
        self.vec_time = (self.compute_time - self.cube_time - self.fa_time_fwd - self.fa_time_bwd
                         - self.conv_time_fwd - self.conv_time_bwd)

    def calculate_schedule_time(self):
        """Scheduling = E2E minus compute, LCCL and non-overlapped comm."""
        self.scheduling_time = (self.e2e_time - self.compute_time - self.lccl_time
                                - self.communication_not_overlapped)

    def update_fa_fwd_info(self, time: float):
        self.fa_time_fwd += time
        self.fa_num_fwd += 1

    def update_fa_bwd_info(self, time: float):
        self.fa_time_bwd += time
        self.fa_num_bwd += 1

    def update_pa_info(self, time: float):
        self.pa_time += time
        self.pa_num += 1

    def update_lccl_info(self, time: float):
        self.lccl_time += time
        self.lccl_num += 1

    def update_conv_fwd_info(self, time: float):
        self.conv_time_fwd += time
        self.conv_num_fwd += 1

    def update_conv_bwd_info(self, time: float):
        self.conv_time_bwd += time
        self.conv_num_bwd += 1

    def update_sdma_info(self, time: float, num: int = 1):
        self.sdma_time += time
        self.sdma_num += num

    def update_cube_info(self, time: float):
        self.cube_time += time
        self.cube_num += 1

    def update_vec_info(self, time: float):
        self.vec_time += time
        self.vec_num += 1

    def set_compute_time(self, time: float):
        self.compute_time = time

    def update_compute_time(self, time: float):
        self.compute_time += time

    def set_e2e_time(self, time: float):
        self.e2e_time = time

    def set_comm_not_overlap(self, time: float):
        self.communication_not_overlapped = time

    def update_comm_not_overlap(self, time: float):
        self.communication_not_overlapped += time

    def update_comm_not_overlap_wait_time(self, time: float):
        # NOTE(review): despite the "update_" name this overwrites rather
        # than accumulates — callers appear to rely on assignment semantics.
        self.wait_time = time

    def set_memory_used(self, memory: float):
        self.memory_used = memory

    def is_not_minimal_profiling(self) -> bool:
        return self.profiling_type == Constant.NPU and not self.minimal_profiling
class ComparisonGenerator:
    """Drives a full comparison run: parse both profilings, then compare."""

    PARSER_DICT = {Constant.NPU: NPUProfilingParser, Constant.GPU: GPUProfilingParser}
    INTERFACE_DICT = {Constant.OVERALL_COMPARE: OverallInterface}

    def __init__(self, args):
        self._args_manager = ArgsManager()
        self._args_manager.init(args)
        self._data_dict = {}

    def run(self):
        self.load_data()
        self.generate_compare_result()

    def load_data(self):
        """Parse base and comparison profilings into result objects."""
        manager = self._args_manager
        base_parser = self.PARSER_DICT.get(manager.base_profiling_type)
        self._data_dict[Constant.BASE_DATA] = base_parser(manager.args, manager.base_path_dict).load_data()
        comparison_parser = self.PARSER_DICT.get(manager.comparison_profiling_type)
        self._data_dict[Constant.COMPARISON_DATA] = comparison_parser(
            manager.args, manager.comparison_path_dict).load_data()

    def generate_compare_result(self):
        """Run overall and detail generators as child processes and wait."""
        overall_data = {
            Constant.BASE_DATA: self._data_dict.get(Constant.BASE_DATA).overall_metrics,
            Constant.COMPARISON_DATA: self._data_dict.get(Constant.COMPARISON_DATA).overall_metrics,
        }
        generator_list = [OverallPerformanceGenerator(overall_data, self._args_manager.args),
                          DetailPerformanceGenerator(self._data_dict, self._args_manager.args)]
        for generator in generator_list:
            generator.start()
        for generator in generator_list:
            generator.join()

    def run_interface(self, compare_type: str) -> dict:
        """Load data and run the comparison interface for compare_type.

        Returns an empty dict for unknown compare types.
        """
        self.load_data()
        interface = self.INTERFACE_DICT.get(compare_type)
        if interface:
            return interface(self._data_dict).run()
        return {}


class ModuleDataPrepare:
    """Builds forward/backward nn.Module call trees for one profiling run."""

    def __init__(self, profiling_data: ProfilingResult):
        self.profiling_data = profiling_data
        self._nn_module_list = []
        self._call_function = []
        # Split python_function events into nn.Module calls and the rest.
        for event in profiling_data.python_function_data:
            if event.lower_name.startswith("nn.module:"):
                self._nn_module_list.append(event)
            else:
                self._call_function.append(event)
        self._bwd_dict = {}
        self._bwd_pid = self._get_bwd_pid()

    @staticmethod
    def update_module_node_info(fwd_root_node, bwd_root_node, func_root_node):
        """BFS both trees, attaching kernels and call stacks to every node."""
        pending = Queue()
        pending.put(fwd_root_node)
        pending.put(bwd_root_node)
        while not pending.empty():
            module_node = pending.get()
            module_node.update_torch_op_kernel_list()
            call_function = func_root_node.find_module_call(module_node.start_time)
            if call_function:
                module_node.reset_call_stack(call_function.call_stack)
            for sub_module_node in module_node.child_nodes:
                pending.put(sub_module_node)

    def build_module_tree(self):
        """Return [forward_root, backward_root], or [None, None] when the
        profiling contains no nn.Module events."""
        if not self._nn_module_list:
            return [None, None]
        self._dispatch_torch_op()
        # Seed both trees with one pseudo-event per kernel launch timestamp.
        event_list = [TraceEventBean({"ts": ts}) for ts in self.profiling_data.kernel_dict.keys()]
        self._nn_module_list.extend(event_list)
        root_node = TreeBuilder.build_module_tree(self._nn_module_list, self.profiling_data.kernel_dict)
        func_root_node = TreeBuilder.build_module_tree(self._call_function, {})
        bwd_module_list = self.get_bwd_module(root_node)
        if bwd_module_list:
            bwd_module_list.extend(event_list)
        bwd_root_node = TreeBuilder.build_module_tree(bwd_module_list, self.profiling_data.kernel_dict)
        self.match_torch_op(root_node, bwd_root_node)
        self.update_module_node_info(root_node, bwd_root_node, func_root_node)
        return [root_node, bwd_root_node]

    def get_bwd_module(self, root_node: ModuleNode):
        """Derive backward-module events from fwd-bwd flow links."""
        bwd_module_list = []
        for flow in self.profiling_data.fwdbwd_dict.values():
            start_point = flow.get("start")
            end_point = flow.get("end")
            if not start_point or not end_point:
                continue
            end_event = self._bwd_dict.get(end_point.start_time)
            if not end_event:
                continue
            call_module = root_node.find_module_call(start_point.start_time)
            if call_module:
                # Clone the backward op and tag it with the forward module.
                bwd_event = copy.deepcopy(end_event)
                bwd_event.reset_name(f"[ BACKWARD ]{call_module.module_name}")
                bwd_module_list.append(bwd_event)
        return bwd_module_list

    def match_torch_op(self, fwd_root_node, bwd_root_node):
        """Attach every torch op to the module whose span contains it."""
        torch_op_list = sorted(self.profiling_data.torch_op_data, key=lambda x: x.start_time)
        for torch_op in torch_op_list:
            if torch_op.is_optimizer() or torch_op.is_step_profiler():
                continue
            matched_module = fwd_root_node.find_module_call(torch_op.start_time)
            if not matched_module:
                matched_module = bwd_root_node.find_module_call(torch_op.start_time)
            if matched_module:
                matched_module.find_torch_op_call(torch_op)

    def _dispatch_torch_op(self):
        """Route optimizer ops into the module list and backward-thread ops
        into the backward lookup dict."""
        for torch_op in self.profiling_data.torch_op_data:
            if torch_op.is_optimizer():
                self._nn_module_list.append(torch_op)
                continue
            if torch_op.pid == self._bwd_pid:
                self._bwd_dict[torch_op.start_time] = torch_op

    def _get_bwd_pid(self):
        """Pid of the backward thread, taken from any fwd-bwd flow end."""
        for flow in self.profiling_data.fwdbwd_dict.values():
            end_point = flow.get("end")
            if end_point:
                return end_point.pid
        return Constant.INVALID_VALUE


class OperatorDataPrepare:
    """Flattens a torch-op tree into its top-layer operator nodes."""

    def __init__(self, profiling_data: ProfilingResult):
        self.profiling_data = profiling_data

    def get_top_layer_ops(self) -> any:
        root_node = TreeBuilder.build_tree(self.profiling_data.torch_op_data,
                                           self.profiling_data.kernel_dict,
                                           self.profiling_data.memory_list)
        result_data = []
        for level1_node in root_node.child_nodes:
            # ProfilerStep wrappers are transparent: take their children.
            if level1_node.is_step_profiler():
                result_data.extend(level1_node.child_nodes)
            else:
                result_data.append(level1_node)
        return result_data
class OverallPerfInterface:
    """Parses one profiling directory and returns its overall metrics.

    Usage: OverallPerfInterface(path).run() -> dict of metric name to value.
    """

    PARSER_DICT = {Constant.NPU: NPUProfilingParser, Constant.GPU: GPUProfilingParser}

    def __init__(self, profiling_path: str):
        self._profiling_path = profiling_path
        self._profiling_path_dict = {}
        self._result_data = {}
        # Fix: declare here instead of creating the attribute for the first
        # time inside _load_data, so it always exists on the instance.
        self._profiling_data = None

    def run(self):
        """Validate the path, parse the data and return the metrics dict."""
        self._check_path()
        self._load_data()
        self._generate_result()
        return self._result_data

    def _check_path(self):
        # Resolve symlinks/relative parts and validate via ArgsManager.
        profiling_path = PathManager.get_realpath(self._profiling_path)
        self._profiling_path_dict = ArgsManager().parse_profiling_path(profiling_path)

    def _load_data(self):
        # Pick the NPU/GPU parser matching the detected profiling type
        # (defaults to NPU when the type is absent).
        args = Args(enable_profiling_compare=True)
        profiling_type = self._profiling_path_dict.get(Constant.PROFILING_TYPE, Constant.NPU)
        self._profiling_data = self.PARSER_DICT.get(profiling_type)(args, self._profiling_path_dict).load_data()

    def _generate_result(self):
        # ProfilingInfo's __dict__ is already a flat {metric: value} mapping.
        overall_data = self._profiling_data.overall_metrics
        self._result_data = getattr(overall_data, "__dict__", {})
class BaseGenerator(Process, ABC):
    """Template for comparison generators that run in a child process.

    run() executes compare() and then generate_view(); concrete subclasses
    must implement both. Results accumulate in the ordered _result_data.
    """

    def __init__(self, profiling_data_dict: dict, args: any):
        super().__init__()
        self._profiling_data_dict = profiling_data_dict
        self._args = args
        self._result_data = OrderedDict()

    def run(self):
        # Child-process entry point: build the data, then render it.
        self.compare()
        self.generate_view()

    @abstractmethod
    def compare(self):
        raise NotImplementedError("Function compare need to be implemented.")

    @abstractmethod
    def generate_view(self):
        raise NotImplementedError("Function generate_view need to be implemented.")
ModuleStatisticComparator +from compare_backend.comparator.operator_comparator import OperatorComparator +from compare_backend.comparator.operator_statistic_comparator import OperatorStatisticComparator +from compare_backend.compare_bean.communication_bean import CommunicationBean +from compare_backend.compare_bean.memory_compare_bean import MemoryCompareBean +from compare_backend.compare_bean.memory_statistic_bean import MemoryStatisticBean +from compare_backend.compare_bean.module_compare_bean import ModuleCompareBean +from compare_backend.compare_bean.module_statistic_bean import ModuleStatisticBean +from compare_backend.compare_bean.operator_compare_bean import OperatorCompareBean +from compare_backend.compare_bean.operator_statistic_bean import OperatorStatisticBean +from compare_backend.data_prepare.module_data_prepare import ModuleDataPrepare +from compare_backend.data_prepare.operator_data_prepare import OperatorDataPrepare +from compare_backend.generator.base_generator import BaseGenerator +from compare_backend.utils.common_func import longest_common_subsequence_matching +from compare_backend.utils.constant import Constant +from compare_backend.utils.module_node import ModuleNode +from compare_backend.utils.name_function import NameFunction +from compare_backend.utils.torch_op_node import TorchOpNode +from compare_backend.view.excel_view import ExcelView + + +class DetailPerformanceGenerator(BaseGenerator): + def __init__(self, profiling_data_dict: dict, args: any): + super().__init__(profiling_data_dict, args) + + def compare(self): + if self._args.enable_operator_compare or self._args.enable_memory_compare or \ + self._args.enable_communication_compare: + print("[INFO] Start to compare performance detail data, please wait.") + comparator_list = self._create_comparator() + for comparator in comparator_list: + self._result_data.update(comparator.generate_data()) + + def generate_view(self): + if not self._result_data: + return + dir_path = 
self._args.output_path if self._args.output_path else "./" + file_name = "performance_comparison_result_{}.xlsx".format(datetime.utcnow().strftime("%Y%m%d%H%M%S")) + result_file_path = os.path.realpath(os.path.join(dir_path, file_name)) + ExcelView(self._result_data, result_file_path, self._args).generate_view() + print(f"[INFO] The comparison result file has been generated: {result_file_path}") + + def _create_comparator(self): + comparator_list = [] + + op_compare_result = [] + if self._args.enable_operator_compare: + module_compare_result = self.match_nn_module() if self._profiling_data_dict.get( + Constant.BASE_DATA).python_function_data and self._profiling_data_dict.get( + Constant.COMPARISON_DATA).python_function_data else [] + if not module_compare_result: + op_compare_result = self.match_torch_op() + + if self._args.enable_memory_compare and not op_compare_result: + op_compare_result = self.match_torch_op() + + if self._args.enable_communication_compare: + communication_data = { + Constant.BASE_DATA: self._profiling_data_dict.get(Constant.BASE_DATA).communication_dict, + Constant.COMPARISON_DATA: self._profiling_data_dict.get(Constant.COMPARISON_DATA).communication_dict} + comparator_list.append(CommunicationComparator(communication_data, CommunicationBean)) + + if self._args.enable_operator_compare: + if module_compare_result: + comparator_list.append(ModuleStatisticComparator(module_compare_result, ModuleStatisticBean)) + comparator_list.append(ModuleComparator(module_compare_result, ModuleCompareBean)) + else: + comparator_list.append(OperatorStatisticComparator(op_compare_result, OperatorStatisticBean)) + comparator_list.append(OperatorComparator(op_compare_result, OperatorCompareBean)) + if self._args.enable_memory_compare: + comparator_list.append(OperatorStatisticComparator(op_compare_result, MemoryStatisticBean)) + comparator_list.append(OperatorComparator(op_compare_result, MemoryCompareBean)) + return comparator_list + + def match_torch_op(self) 
-> list: + base_ops = OperatorDataPrepare(self._profiling_data_dict.get(Constant.BASE_DATA)).get_top_layer_ops() + comparison_ops = OperatorDataPrepare( + self._profiling_data_dict.get(Constant.COMPARISON_DATA)).get_top_layer_ops() + if not base_ops and not comparison_ops: + return [] + name_func = NameFunction(self._args).get_name_func() + op_compare_result = longest_common_subsequence_matching(base_ops, comparison_ops, name_func) + if self._args.max_kernel_num is not None: + op_compare_result = self._drill_down(op_compare_result, name_func) + return op_compare_result + + def _drill_down(self, compare_result_data: list, name_func: any) -> list: + drill_down_result = [] + compare_result_data.reverse() + op_deque = deque(compare_result_data) + while op_deque: + match_data = op_deque.pop() + base_op = match_data[0] if match_data[0] else TorchOpNode() + comparison_op = match_data[1] if match_data[1] else TorchOpNode() + if not base_op.child_nodes or not comparison_op.child_nodes: + drill_down_result.append(match_data) + continue + if max(base_op.kernel_num, comparison_op.kernel_num) <= self._args.max_kernel_num: + drill_down_result.append(match_data) + continue + match_list = longest_common_subsequence_matching(base_op.child_nodes, comparison_op.child_nodes, name_func) + match_list.reverse() + for data in match_list: + op_deque.append(data) + + return drill_down_result + + def match_nn_module(self) -> list: + module_compare_result = [] + base_root_node = ModuleDataPrepare(self._profiling_data_dict.get(Constant.BASE_DATA)).build_module_tree() + comparison_root_node = ModuleDataPrepare( + self._profiling_data_dict.get(Constant.COMPARISON_DATA)).build_module_tree() + for index, base_node in enumerate(base_root_node): + comparison_node = comparison_root_node[index] if index < len(comparison_root_node) else None + if not base_node or not comparison_node: + continue + module_compare_result.extend(self._matching_all_modules(base_node, comparison_node)) + return 
module_compare_result + + def _matching_all_modules(self, base_node: ModuleNode, comparison_node: ModuleNode): + all_matched_modules = [] + matched_queue = Queue() + matched_queue.put([base_node, comparison_node]) + while not matched_queue.empty(): + matched_base_node, matched_comparison_node = matched_queue.get() + matched_node_list = self._matching_common_subsequence(matched_base_node, matched_comparison_node) + all_matched_modules.extend(matched_node_list) + for matched_node in matched_node_list: + matched_queue.put(matched_node) + return all_matched_modules + + def _matching_common_subsequence(self, base_node: ModuleNode, comparison_node: ModuleNode): + base_modules = base_node.child_nodes if base_node else [] + comparison_modules = comparison_node.child_nodes if comparison_node else [] + if not base_modules and not comparison_modules: + return [] + name_func = NameFunction(self._args).get_module_name + return longest_common_subsequence_matching(base_modules, comparison_modules, name_func) diff --git a/profiler/compare_tools_review/compare_backend/generator/overall_performance_generator.py b/profiler/compare_tools_review/compare_backend/generator/overall_performance_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..9fe31d0ea54b5e4eaa38239a7b825d1b2e80b00f --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/generator/overall_performance_generator.py @@ -0,0 +1,19 @@ +from compare_backend.comparator.overall_performance_comparator import OverallPerformanceComparator +from compare_backend.compare_bean.profiling_info import ProfilingInfo +from compare_backend.generator.base_generator import BaseGenerator +from compare_backend.view.screen_view import ScreenView + + +class OverallPerformanceGenerator(BaseGenerator): + def __init__(self, profiling_data_dict: dict, args: any): + super().__init__(profiling_data_dict, args) + + def compare(self): + if not self._args.enable_profiling_compare: + return + self._result_data = 
class OverallInterface:
    """Entry point for the overall (end-to-end) performance comparison."""

    def __init__(self, overall_data: dict):
        # Maps Constant.BASE_DATA / Constant.COMPARISON_DATA to parsed results.
        self._overall_data = overall_data

    def run(self):
        """Compare the overall metrics of both sides and return the comparator output."""
        base_metrics = self._overall_data.get(Constant.BASE_DATA).overall_metrics
        comparison_metrics = self._overall_data.get(Constant.COMPARISON_DATA).overall_metrics
        metrics = {Constant.BASE_DATA: base_metrics,
                   Constant.COMPARISON_DATA: comparison_metrics}
        return OverallPerformanceComparator(metrics, ProfilingInfo).generate_data()
class ProfilingResult:
    """Accumulates everything extracted from a single profiling run."""

    def __init__(self, profiling_type):
        self._profiling_type = profiling_type
        self.torch_op_data = []        # framework-side torch op events
        self.kernel_dict = {}          # flow start ts -> [KernelEvent, ...]
        self.memory_list = []          # MemoryEvent records
        self.communication_dict = {}   # comm name -> {"comm_list": [...], "comm_task": {...}}
        self.overall_metrics = ProfilingInfo(profiling_type)
        self.python_function_data = []
        self.fwdbwd_dict = {}          # flow id -> {"start": event, "end": event}

    def update_torch_op_data(self, event: TraceEventBean):
        """Register a framework-side torch op event."""
        event.is_torch_op = True
        self.torch_op_data.append(event)

    def update_python_function_data(self, event: TraceEventBean):
        """Register a python-function (call stack) event."""
        self.python_function_data.append(event)

    def update_fwdbwd_data(self, flow_type: str, event: TraceEventBean):
        """Record one side ("start"/"end") of a forward-backward flow."""
        self.fwdbwd_dict.setdefault(event.id, {})[flow_type] = event

    def update_kernel_dict(self, start_time: Decimal, kernel_event: TraceEventBean):
        """Attach a device kernel to the op whose flow starts at *start_time*."""
        self.kernel_dict.setdefault(start_time, []).append(KernelEvent(kernel_event, self._profiling_type))

    def update_memory_list(self, memory_data: dict):
        """Append one allocation/release record."""
        self.memory_list.append(MemoryEvent(memory_data))

    def update_communication_dict(self, comm_name: str, comm_dur: float):
        """Append one duration to the named communication op."""
        self.communication_dict.setdefault(comm_name, {}).setdefault("comm_list", []).append(comm_dur)

    def update_comm_task_data(self, comm_name: str, task_event: TraceEventBean):
        """Append one device-side sub-task duration to the named communication op."""
        self.communication_dict.setdefault(comm_name, {}).setdefault("comm_task", {}).setdefault(
            task_event.name, []).append(task_event.dur)


class BaseProfilingParser(ABC):
    """Template for GPU/NPU parsers: reads the raw trace, dispatches events to
    picker functions supplied by the subclass, and assembles a ProfilingResult.
    """

    def __init__(self, args: any, path_dict: dict):
        self._args = args
        self._profiling_type = path_dict.get(Constant.PROFILING_TYPE)
        self._profiling_path = path_dict.get(Constant.PROFILING_PATH)
        self._json_path = path_dict.get(Constant.TRACE_PATH)
        # Bug fix: the empty placeholder used to be chosen by comparing the
        # profiling *path* against Constant.NPU (always false); the intended
        # comparison is the profiling *type* -- NPU trace files parse to a
        # list, GPU files to a dict (see _read_trace_event and the subclasses).
        self._trace_events = [] if self._profiling_type == Constant.NPU else {}
        self._enable_profiling_compare = args.enable_profiling_compare
        self._enable_operator_compare = args.enable_operator_compare
        self._enable_memory_compare = args.enable_memory_compare
        self._enable_communication_compare = args.enable_communication_compare
        self._dispatch_func = self._get_dispatch_func()
        self._result_data = ProfilingResult(self._profiling_type)
        self._memory_events = []
        self._flow_dict = {}
        self._fwdbwd_dict = {}
        self._all_kernels = {}
        self._comm_task_list = []
        self._comm_list = []
        self._read_trace_event()
        self._cur_func_index = 0

    @abstractmethod
    def _update_memory_list(self):
        raise NotImplementedError("Function _update_memory_list need to be implemented.")

    @abstractmethod
    def _update_overall_metrics(self):
        raise NotImplementedError("Function _update_overall_metrics need to be implemented.")

    @abstractmethod
    def _is_kernel_event(self, event: TraceEventBean):
        raise NotImplementedError("Function _is_kernel_event need to be implemented.")

    @abstractmethod
    def _is_flow_event(self, event: TraceEventBean):
        raise NotImplementedError("Function _is_flow_event need to be implemented.")

    @abstractmethod
    def _is_torch_op_event(self, event: TraceEventBean):
        raise NotImplementedError("Function _is_torch_op_event need to be implemented.")

    @abstractmethod
    def _get_dispatch_func(self):
        raise NotImplementedError("Function _get_dispatch_func need to be implemented.")
self._enable_memory_compare: + self._update_memory_list() + if self._enable_profiling_compare: + self._update_overall_metrics() + self._check_result_data() + return self._result_data + + def _dispatch_events(self): + if not self._dispatch_func: + return + index_list = list(range(0, len(self._dispatch_func))) * 2 + for event in self._trace_events: + if not event.is_dict(): + continue + if event.is_m_mode(): + continue + self.__picking_event(event, index_list) + + def __picking_event(self, event: TraceEventBean, index_list: list): + for index in range(self._cur_func_index, self._cur_func_index + len(self._dispatch_func)): + func_index = index_list[index] + res = self._dispatch_func[func_index](event) + if res: + self._cur_func_index = func_index + break + + def _picking_torch_op_event(self, event: TraceEventBean): + if self._is_torch_op_event(event): + self._result_data.update_torch_op_data(event) + return True + return False + + def _picking_kernel_event(self, event: TraceEventBean): + if self._is_kernel_event(event): + self._all_kernels[f"{event.pid}-{event.tid}-{event.start_time}"] = event + return True + return False + + def _picking_flow_event(self, event: TraceEventBean): + if self._is_flow_event(event): + if event.is_flow_start(): + self._flow_dict.setdefault(event.id, {})["start"] = event + elif event.is_flow_end(): + self._flow_dict.setdefault(event.id, {})["end"] = event + return True + return False + + def _picking_python_function_event(self, event: TraceEventBean): + if event.is_python_function(): + self._result_data.update_python_function_data(event) + return True + return False + + def _picking_fwdbwd_flow_event(self, event: TraceEventBean): + if event.is_fwdbwd(): + if event.is_flow_start(): + self._result_data.update_fwdbwd_data("start", event) + elif event.is_flow_end(): + self._result_data.update_fwdbwd_data("end", event) + return True + return False + + def _update_kernel_dict(self): + if self._profiling_type == Constant.NPU: + for comm in 
self._comm_list: + self._all_kernels[f"{comm.pid}-{comm.tid}-{comm.start_time}"] = comm + for flow_event in self._flow_dict.values(): + start_event = flow_event.get("start") + end_event = flow_event.get("end") + if not start_event or not end_event: + continue + kernel_event = self._all_kernels.get(f"{end_event.pid}-{end_event.tid}-{end_event.start_time}") + if not kernel_event: + continue + self._result_data.update_kernel_dict(start_event.start_time, kernel_event) + + def _update_communication_dict(self): + if self._profiling_type == Constant.GPU: + self._comm_list = list(filter(lambda x: x.is_nccl_name(), self._all_kernels.values())) + self._comm_list.sort(key=lambda x: x.start_time) + self._comm_task_list.sort(key=lambda x: x.start_time) + task_index = 0 + for communication_op in self._comm_list: + name_list = communication_op.lower_name.split("_") + if len(name_list) < 2: + continue + comm_name = name_list[1] + self._result_data.update_communication_dict(comm_name, communication_op.dur) + while task_index < len(self._comm_task_list): + task_event = self._comm_task_list[task_index] + if task_event.start_time < communication_op.start_time: + task_index += 1 + continue + if task_event.start_time > communication_op.end_time: + break + self._result_data.update_comm_task_data(comm_name, task_event) + task_index += 1 + + def _check_result_data(self): + if self._enable_operator_compare or self._enable_memory_compare: + if not self._result_data.torch_op_data: + print(f"[WARNING] Can't find any torch op in the file: {self._profiling_path}") + if self._enable_operator_compare and not self._result_data.kernel_dict: + print(f"[WARNING] Can't find any flow event in the file: {self._profiling_path}") + if self._enable_memory_compare and not self._result_data.memory_list: + print(f"[WARNING] Can't find any memory event in the file: {self._profiling_path}") + if self._enable_communication_compare and not self._result_data.communication_dict: + print(f"[WARNING] Can't find any 
communication op in the file: {self._profiling_path}") + + def _read_trace_event(self): + try: + self._trace_events = FileReader.read_trace_file(self._json_path) + except Exception: + print(f"[ERROR] Failed to read the file: {self._json_path}") diff --git a/profiler/compare_tools_review/compare_backend/profiling_parser/gpu_profiling_parser.py b/profiler/compare_tools_review/compare_backend/profiling_parser/gpu_profiling_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..c4089aec9bdcb35b80ae9ff9121fcd75bde3a63e --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/profiling_parser/gpu_profiling_parser.py @@ -0,0 +1,189 @@ +import sys +from collections import defaultdict, Counter + +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.profiling_parser.base_profiling_parser import BaseProfilingParser +from compare_backend.utils.constant import Constant + + +class GPUProfilingParser(BaseProfilingParser): + CUBE_MARK = ['gemm', 'conv', 'cutlass', 'wgrad'] + FA_MARK_LIST = [['fmha', 'kernel'], ['flash', 'kernel'], ['attention', 'kernel']] + SDMA_MARK_LIST = ['htod', 'dtod', 'dtoh', 'memset (device)'] + FLOW_CAT = ("async_gpu", "async_cpu_to_gpu", "ac2g", "async") + TORCH_OP_CAT = ("cpu_op", "user_annotation", "cuda_runtime", "operator", "runtime") + + def __init__(self, args: any, path_dict: dict): + super().__init__(args, path_dict) + self._trace_events = [TraceEventBean(event) for event in self._trace_events.get("traceEvents", [])] + self._flow_cat = (args.gpu_flow_cat,) if args.gpu_flow_cat else self.FLOW_CAT + self._compute_stream_id = self._infer_compute_stream_id() + self._marks = defaultdict(int) + self._aten_index = 0 + + @classmethod + def __is_flash_attention(cls, name: str): + for fa_mark in cls.FA_MARK_LIST: + if not [1 for mark in fa_mark if mark not in name.lower()]: + return True + return False + + @classmethod + def __is_sdma_time(cls, name: str): + for mark 
in cls.SDMA_MARK_LIST: + if mark in name.lower(): + return True + return False + + def _update_memory_list(self): + if not self._enable_memory_compare: + return + self._memory_events.sort(key=lambda x: x.start_time) + addr_dict = {} + for memory_event in self._memory_events: + allocate_bytes = memory_event.bytes_kb + record = addr_dict.get(memory_event.addr) + if allocate_bytes > 0: + if record: + self._result_data.update_memory_list(record) + addr_dict[memory_event.addr] = {Constant.SIZE: allocate_bytes, + Constant.TS: memory_event.start_time, + Constant.ALLOCATION_TIME: memory_event.start_time} + if allocate_bytes < 0 and record: + if abs(allocate_bytes) == record.get(Constant.SIZE): + record[Constant.RELEASE_TIME] = memory_event.start_time + self._result_data.update_memory_list(record) + del addr_dict[memory_event.addr] + for record in addr_dict.values(): + self._result_data.update_memory_list(record) + + def _update_overall_metrics(self): + self._calculate_performance_time() + self.__parse_memory_reserved() + self._result_data.overall_metrics.calculate_vec_time() + self._result_data.overall_metrics.calculate_schedule_time() + self._result_data.overall_metrics.trans_time_to_s() + + def _calculate_performance_time(self): + min_ts = sys.float_info.max + max_ts = sys.float_info.min + self._trace_events.sort(key=lambda x: x.start_time) + aten_events = list(filter(lambda x: x.name.startswith("aten::"), self._trace_events)) + flow_dict_new = {} + for flow_event in self._flow_dict.values(): + start_event = flow_event.get("start") + end_event = flow_event.get("end") + if start_event and end_event: + flow_dict_new[end_event.start_time] = start_event.start_time + for event in self._trace_events: + if event.stream: + min_ts = min(event.start_time, min_ts) + max_ts = max(event.end_time, max_ts) + if event.stream == self._compute_stream_id and self.__is_sdma_time(event.name): + self._result_data.overall_metrics.update_sdma_info(event.dur) + continue + if not 
event.is_kernel_cat(): + continue + self.__add_marks(event) + if event.is_nccl_name(): + continue + self.__add_compute_time(event, aten_events, flow_dict_new) + self._aten_events = None + self._result_data.overall_metrics.set_e2e_time(float(max_ts - min_ts)) + self.__add_compute_and_overlap_time() + + def __add_compute_and_overlap_time(self): + compute_time = len([_ for _, value in self._marks.items() if value < 0]) + communication_not_overlapped = len([_ for _, value in self._marks.items() if value > 0]) + self._result_data.overall_metrics.set_compute_time(compute_time) + self._result_data.overall_metrics.set_comm_not_overlap(communication_not_overlapped) + + def __add_marks(self, event: TraceEventBean): + if event.is_nccl_name(): + for timestep in range(int(event.start_time + 1), int(event.end_time + 1)): + self._marks[str(timestep)] += 1 # mark this timestep in communication stream + else: + for timestep in range(int(event.start_time + 1), int(event.end_time + 1)): + self._marks[str(timestep)] += -100 # mark this timestep in compute stream + + def __add_compute_time(self, event: TraceEventBean, aten_events: list, flow_dict_new: dict): + if self.__is_flash_attention(event.name): + if event.is_backward(): + self._result_data.overall_metrics.update_fa_bwd_info(event.dur) + else: + self._result_data.overall_metrics.update_fa_fwd_info(event.dur) + elif any(cube_mark in event.lower_name for cube_mark in self.CUBE_MARK): + is_conv = self.__check_is_conv(event, aten_events, flow_dict_new) + if is_conv == "conv_fwd": + self._result_data.overall_metrics.update_conv_fwd_info(event.dur) + elif is_conv == "conv_bwd": + self._result_data.overall_metrics.update_conv_bwd_info(event.dur) + else: + self._result_data.overall_metrics.update_cube_info(event.dur) + else: + self._result_data.overall_metrics.update_vec_info(event.dur) + + def __check_is_conv(self, event: TraceEventBean, aten_events: list, flow_dict_new: dict) -> str: + flow_start_time = 
flow_dict_new.get(event.start_time) + if not flow_start_time: + return "" + aten_len = len(aten_events) + while self._aten_index < aten_len: + cur_aten = aten_events[self._aten_index] + if cur_aten.end_time < flow_start_time: + self._aten_index += 1 + continue + if cur_aten.start_time < flow_start_time: + if cur_aten.is_conv(): + return "conv_bwd" if cur_aten.is_backward() else "conv_fwd" + return "" + + def _picking_memory_event(self, event: TraceEventBean): + if event.is_memory_event(): + self._memory_events.append(event) + return True + return False + + def _is_torch_op_event(self, event: TraceEventBean): + return event.lower_cat in self.TORCH_OP_CAT + + def _is_kernel_event(self, event: TraceEventBean): + return event.is_kernel_cat() + + def _is_flow_event(self, event: TraceEventBean): + return event.lower_cat in self._flow_cat + + def __parse_memory_reserved(self): + if not self._memory_events: + print("[INFO] Gpu profiling data doesn't contain memory info.") + return + memory_used = max([event.total_reserved for event in self._memory_events]) / 1024 ** 3 + self._result_data.overall_metrics.set_memory_used(memory_used) + + def _get_dispatch_func(self): + func_set = set() + if self._enable_memory_compare or self._enable_operator_compare: + func_set.add(self._picking_torch_op_event) + if self._enable_communication_compare: + func_set.add(self._picking_kernel_event) + if self._enable_operator_compare: + func_set.add(self._picking_python_function_event) + func_set.add(self._picking_fwdbwd_flow_event) + if self._enable_operator_compare or self._args.max_kernel_num: + func_set.add(self._picking_kernel_event) + func_set.add(self._picking_flow_event) + if self._enable_memory_compare or self._enable_profiling_compare: + func_set.add(self._picking_memory_event) + return list(func_set) + + def _infer_compute_stream_id(self): + if not self._enable_profiling_compare: + return -1 + kernel_stream_ids = [] + for event in self._trace_events: + if event.is_kernel_except_nccl() 
class NPUProfilingParser(BaseProfilingParser):
    """Parser for Ascend NPU profiling output (trace json + csv summaries)."""

    FLOW_CAT = "async_npu"
    TORCH_OP_CAT = "cpu_op"
    ACTIVE_CPU = "ProfilerActivity.CPU"
    LEVEL_0 = "Level0"

    def __init__(self, args: any, path_dict: dict):
        super().__init__(args, path_dict)
        ascend_output = path_dict.get(Constant.ASCEND_OUTPUT_PATH, "")
        self._operator_memory_path = os.path.join(ascend_output, "operator_memory.csv")
        self._memory_record_path = os.path.join(ascend_output, "memory_record.csv")
        self._kernel_detail_path = os.path.join(ascend_output, "kernel_details.csv")
        self._info_json_path = path_dict.get(Constant.INFO_JSON_PATH, "")
        self._trace_events = [TraceEventBean(event) for event in self._trace_events]
        self._hccl_pid = None
        self._hccl_op_tid_list = []
        self._kernel_pid = None
        self._overlap_pid = None
        self._enqueue_dict = {}
        self._dequeue_data = []
        self._overlap_analysis = []
        # Re-resolve the pickers (the base __init__ resolved them before the
        # NPU-specific state above existed).
        self._dispatch_func = self._get_dispatch_func()
        self._filter_meta_id()

    def _get_dispatch_func(self):
        """Return the enabled pickers in a deterministic order.

        Bug fix: this used to collect the pickers into a set and return
        list(set), so the iteration order -- and therefore which picker got the
        first chance to consume an ambiguous event -- varied between runs.
        """
        funcs = []

        def add(func):
            # Deduplicate while preserving first-insertion order.
            if func not in funcs:
                funcs.append(func)

        if self._enable_memory_compare or self._enable_operator_compare:
            add(self._picking_torch_op_event)
        if self._enable_operator_compare or self._args.max_kernel_num:
            add(self._picking_kernel_event)
            add(self._picking_flow_event)
        if self._enable_operator_compare:
            add(self._picking_python_function_event)
            add(self._picking_fwdbwd_flow_event)
        if self._enable_memory_compare:
            add(self._picking_task_queue_data)
        if self._enable_communication_compare:
            add(self._picking_hccl_event)
        if self._enable_profiling_compare:
            add(self._picking_overlap_analysis_data)
            add(self._picking_kernel_event)
            add(self._picking_hccl_event)
        return funcs
+ else: + self._result_data.update_memory_list({Constant.SIZE: data.size, + Constant.TS: data.allocation_time, + Constant.ALLOCATION_TIME: data.allocation_time, + Constant.RELEASE_TIME: data.release_time}) + + def __match_dequeue_data(self, ts_time: float) -> int: + if not self._dequeue_data: + return Constant.INVALID_VALUE + left, right = 0, len(self._dequeue_data) - 1 + while right > left: + mid = left + ceil((right - left) / 2) + if ts_time >= self._dequeue_data[mid].start_time: + left = mid + else: + right = mid - 1 + return self._dequeue_data[left].corr_id if self._dequeue_data[left].start_time <= ts_time <= \ + self._dequeue_data[left].end_time else Constant.INVALID_VALUE + + def _update_overall_metrics(self): + self.__parse_info_json() + self.__parse_mem_csv() + self.__parse_kernel_csv() + self.__add_lccl_time() + self.__add_sdma_time() + self.__add_overlap_analysis_time() + self._picking_notify_wait_event_and_not_overlap_event() + self.__add_overlap_wait_time() + self._result_data.overall_metrics.calculate_other_time() + self._result_data.overall_metrics.calculate_schedule_time() + self._result_data.overall_metrics.trans_time_to_s() + + def _picking_notify_wait_event_and_not_overlap_event(self): + self.notify_event_cache = [] + self._not_overlaped_commu_event = [] + for event in self._comm_task_list: + if event.name == 'Notify_Wait' and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_CHECK' \ + and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_ACK': + self.notify_event_cache.append(event) + for event in self._overlap_analysis: + if event.is_comm_not_overlap(): + self._not_overlaped_commu_event.append(event) + self._not_overlaped_commu_event.sort(key=lambda x: x.start_time) + + def __add_overlap_wait_time(self): + notify_wait_event_dict = dict() + for notify_event in self.notify_event_cache: + if notify_event.tid in notify_wait_event_dict: + notify_wait_event_dict[notify_event.tid].append(notify_event) + else: + notify_wait_event_dict[notify_event.tid] = 
[notify_event] + + if self._result_data.overall_metrics.is_level0: + return + + total_time = 0 + for commu_event in self._not_overlaped_commu_event: + wait_time_list = [0] + commu_event_start_time = float(commu_event.start_time) + commu_event_end_time = float(commu_event.start_time) + commu_event.dur + + for plane_id, events in notify_wait_event_dict.items(): + wait_time = 0 + idx = 0 + for notify_event in events: + notify_event_start_time = float(notify_event.start_time) + notify_event_end_time = float(notify_event.start_time) + notify_event.dur + if notify_event_start_time < commu_event_start_time and notify_event_end_time > \ + commu_event_end_time: + wait_time = commu_event_end_time - commu_event_start_time + break + elif notify_event_start_time < commu_event_start_time <= notify_event_end_time <= \ + commu_event_end_time: + wait_time += notify_event_end_time - commu_event_start_time + idx += 1 + elif commu_event_start_time <= notify_event_start_time <= commu_event_end_time < \ + notify_event_end_time: + wait_time += commu_event_end_time - notify_event_start_time + break + elif notify_event_start_time >= commu_event_start_time and notify_event_end_time <= \ + commu_event_end_time: + wait_time += notify_event_end_time - notify_event_start_time + idx += 1 + elif notify_event_end_time < commu_event_start_time: + idx += 1 + else: + break + + wait_time_list.append(wait_time) + notify_wait_event_dict[plane_id] = notify_wait_event_dict[plane_id][idx:] + total_time += max(wait_time_list) + self._result_data.overall_metrics.update_comm_not_overlap_wait_time(total_time) + + def _picking_hccl_event(self, event: TraceEventBean): + if event.pid != self._hccl_pid or not event.is_x_mode(): + return False + if event.tid in self._hccl_op_tid_list: + self._comm_list.append(event) + else: + self._comm_task_list.append(event) + return True + + def _picking_task_queue_data(self, event: TraceEventBean): + if event.is_enqueue(): + self._enqueue_dict[event.corr_id] = event.start_time 
+ return True + elif event.is_dequeue(): + self._dequeue_data.append(event) + return True + return False + + def _picking_overlap_analysis_data(self, event: TraceEventBean): + if event.pid == self._overlap_pid and event.is_x_mode(): + self._overlap_analysis.append(event) + return True + return False + + def _is_kernel_event(self, event: TraceEventBean): + return event.pid == self._kernel_pid and event.is_x_mode() + + def _is_flow_event(self, event: TraceEventBean): + return event.lower_cat == self.FLOW_CAT + + def _is_torch_op_event(self, event: TraceEventBean): + return event.lower_cat == self.TORCH_OP_CAT + + def _filter_meta_id(self): + for event in self._trace_events: + if not event.is_process_meta(): + continue + if event.is_hccl_process_name(): + self._hccl_pid = event.pid + elif event.is_npu_process_name(): + self._kernel_pid = event.pid + elif event.is_overlap_process_name(): + self._overlap_pid = event.pid + if not self._enable_communication_compare: + return + for event in self._trace_events: + if not event.is_thread_meta(): + continue + if event.pid == self._hccl_pid and event.is_communication_op_thread(): + self._hccl_op_tid_list.append(event.tid) + + def __parse_info_json(self): + try: + json_data = FileReader.read_trace_file(self._info_json_path) + except Exception: + print('[WARNING] Failed to read profiler_info.json.') + return + if not isinstance(json_data, dict) or not json_data: + print('[WARNING] Invalid profiler info.') + return + level = json_data.get('config', {}).get('experimental_config', {}).get('_profiler_level', '') + if self.LEVEL_0 != level: + return + self._result_data.overall_metrics.is_level0 = True + if self.ACTIVE_CPU in json_data.get('config', {}).get('common_config', {}).get('activities', []): + return + self._result_data.overall_metrics.minimal_profiling = True + + def __add_lccl_time(self): + for event in self._all_kernels.values(): + if event.is_lccl(): + self._result_data.overall_metrics.update_lccl_info(event.dur) + + def 
__parse_kernel_csv(self): + try: + kernel_details = FileReader.read_csv_file(self._kernel_detail_path, KernelDetailsBean) + except Exception: + print('[WARNING] Npu kernel details csv file is not available.') + return + if not kernel_details or kernel_details[0].is_hide_op_pmu(): + self._result_data.overall_metrics.hide_op_details = True + return + for kernel in kernel_details: + if kernel.is_invalid(): + continue + if kernel.is_flash_attention(): + if kernel.is_fa_bwd(): + self._result_data.overall_metrics.update_fa_bwd_info(kernel.duration) + else: + self._result_data.overall_metrics.update_fa_fwd_info(kernel.duration) + elif kernel.is_conv(): + if kernel.is_conv_bwd(): + self._result_data.overall_metrics.update_conv_bwd_info(kernel.duration) + else: + self._result_data.overall_metrics.update_conv_fwd_info(kernel.duration) + elif kernel.is_cube(): + self._result_data.overall_metrics.update_cube_info(kernel.duration) + elif kernel.is_sdma(): + self._result_data.overall_metrics.update_sdma_info(kernel.duration) + elif kernel.is_page_attention(): + self._result_data.overall_metrics.update_pa_info(kernel.duration) + elif kernel.is_vector(): + self._result_data.overall_metrics.update_vec_info(kernel.duration) + else: + self._result_data.overall_metrics.update_cube_info(kernel.duration) + + def __parse_mem_csv(self): + try: + memory_record = FileReader.read_csv_file(self._memory_record_path, MemoryRecordBean) + except FileNotFoundError: + print('[INFO] Npu memory record csv file is not available.') + except Exception: + print('[WARNING] Load memory info failed.') + else: + memory_used = max([memory.total_reserved_mb for memory in memory_record]) / 1024 + self._result_data.overall_metrics.set_memory_used(memory_used) + + def __add_overlap_analysis_time(self): + if not self._overlap_analysis: + print('[ERROR] Failed to get overlap analysis data.') + return + min_ts = sys.float_info.max + max_ts = sys.float_info.min + for event in self._overlap_analysis: + if 
event.is_computing_event(): + self._result_data.overall_metrics.update_compute_time(event.dur) + min_ts = min(event.start_time, min_ts) + max_ts = max(event.end_time, max_ts) + elif event.is_comm_not_overlap(): + self._result_data.overall_metrics.update_comm_not_overlap(event.dur) + min_ts = min(event.start_time, min_ts) + max_ts = max(event.end_time, max_ts) + self._result_data.overall_metrics.set_e2e_time(float(max_ts - min_ts)) + + def __add_sdma_time(self) -> (float, int): + event_wait_stream, ai_core_stream = set(), set() + sdma_dict = {} + for event in self._all_kernels.values(): + stream_id = event.stream_id + if not stream_id: + continue + if event.is_event_wait(): + event_wait_stream.add(stream_id) + elif event.is_sdma_event(): + sdma_dict.setdefault(stream_id, []).append(event.dur) + elif event.is_compute_event(): + ai_core_stream.add(stream_id) + compute_stream = event_wait_stream & ai_core_stream if event_wait_stream else ai_core_stream + for stream in compute_stream: + dur_list = sdma_dict.get(stream, []) + self._result_data.overall_metrics.update_sdma_info(sum(dur_list), len(dur_list)) diff --git a/profiler/compare_tools_review/compare_backend/utils/__init__.py b/profiler/compare_tools_review/compare_backend/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/compare_tools_review/compare_backend/utils/args_manager.py b/profiler/compare_tools_review/compare_backend/utils/args_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..4b5947fa7bccc32277bb9d18d97ab71249c66941 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/utils/args_manager.py @@ -0,0 +1,136 @@ +import os.path +import re + +from common_func.path_manager import PathManager +from compare_backend.utils.constant import Constant +from compare_backend.utils.file_reader import FileReader + + +class Singleton(object): + def __init__(self, cls): + self._cls = 
@Singleton
class ArgsManager:
    """Process-wide holder for validated CLI arguments and resolved profiling paths."""

    def __init__(self):
        self._args = None
        self._base_path_dict = {}
        self._comparison_path_dict = {}

    @property
    def args(self):
        return self._args

    @property
    def base_profiling_type(self):
        return self._base_path_dict.get(Constant.PROFILING_TYPE)

    @property
    def comparison_profiling_type(self):
        return self._comparison_path_dict.get(Constant.PROFILING_TYPE)

    @property
    def base_profiling_path(self):
        return self._args.base_profiling_path

    @property
    def comparison_profiling_path(self):
        # Bug fix: this returned self._args.comparison_profiling_path_dict, an
        # attribute that is never set on the parsed args (AttributeError on any
        # access); the real argument -- used in init() below -- is
        # comparison_profiling_path.
        return self._args.comparison_profiling_path

    @property
    def base_path_dict(self):
        return self._base_path_dict

    @property
    def comparison_path_dict(self):
        return self._comparison_path_dict

    @property
    def enable_profiling_compare(self):
        return self._args.enable_profiling_compare

    @property
    def enable_operator_compare(self):
        return self._args.enable_operator_compare

    @property
    def enable_memory_compare(self):
        return self._args.enable_memory_compare

    @property
    def enable_communication_compare(self):
        return self._args.enable_communication_compare

    @classmethod
    def check_profiling_path(cls, file_path: str):
        """Validate an input path (common checks + owner consistency)."""
        PathManager.input_path_common_check(file_path)
        PathManager.check_path_owner_consistent(file_path)

    @classmethod
    def check_output_path(cls, output_path: str):
        """Validate the output directory, creating it safely if needed."""
        PathManager.check_input_directory_path(output_path)
        PathManager.make_dir_safety(output_path)
        PathManager.check_path_writeable(output_path)

    def parse_profiling_path(self, file_path: str):
        """Classify a profiling input and resolve its trace/info file paths.

        A .json file is accepted directly (its type detected from content); a
        directory must contain trace_view.json, optionally under
        ASCEND_PROFILER_OUTPUT, and is treated as NPU data.

        Raises RuntimeError for unsupported suffixes or unparsed data.
        """
        self.check_profiling_path(file_path)
        if os.path.isfile(file_path):
            (split_file_path, split_file_name) = os.path.split(file_path)
            (shot_name, extension) = os.path.splitext(split_file_name)
            if extension != ".json":
                msg = f"Invalid profiling path suffix: {file_path}"
                raise RuntimeError(msg)
            json_type = FileReader.check_json_type(file_path)
            return {Constant.PROFILING_TYPE: json_type, Constant.PROFILING_PATH: file_path,
                    Constant.TRACE_PATH: file_path}
        ascend_output = os.path.join(file_path, "ASCEND_PROFILER_OUTPUT")
        profiler_output = ascend_output if os.path.isdir(ascend_output) else file_path
        json_path = os.path.join(profiler_output, "trace_view.json")
        if not os.path.isfile(json_path):
            msg = (f"The data is not collected by PyTorch Adaptor mode or the data is not parsed. "
                   f"Invalid profiling path: {profiler_output}")
            raise RuntimeError(msg)
        path_dict = {Constant.PROFILING_TYPE: Constant.NPU, Constant.PROFILING_PATH: file_path,
                     Constant.TRACE_PATH: json_path, Constant.ASCEND_OUTPUT_PATH: profiler_output}
        for dir_name in os.listdir(file_path):
            if dir_name == "profiler_info.json" or re.match(r"profiler_info_[0-9]+\.json", dir_name):
                path_dict.update({Constant.INFO_JSON_PATH: os.path.join(file_path, dir_name)})
        return path_dict
True + self._args.enable_communication_compare = True + + base_profiling_path = PathManager.get_realpath(self._args.base_profiling_path) + self.check_profiling_path(base_profiling_path) + self._base_path_dict = self.parse_profiling_path(base_profiling_path) + comparison_profiling_path = PathManager.get_realpath(self._args.comparison_profiling_path) + self.check_profiling_path(comparison_profiling_path) + self._comparison_path_dict = self.parse_profiling_path(comparison_profiling_path) + + if self._args.output_path: + self.check_output_path(PathManager.get_realpath(self._args.output_path)) diff --git a/profiler/compare_tools_review/compare_backend/utils/common_func.py b/profiler/compare_tools_review/compare_backend/utils/common_func.py new file mode 100644 index 0000000000000000000000000000000000000000..a3cab286e33a9d474e85d0b51023d73edc22ca56 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/utils/common_func.py @@ -0,0 +1,92 @@ +from decimal import Decimal + +import numpy + + +def calculate_diff_ratio(base_value: float, comparison_value: float): + if not base_value and not comparison_value: + ratio = 1.0 + else: + ratio = float('inf') if not base_value else comparison_value / base_value + return [comparison_value - base_value, ratio] + + +def update_order_id(data_list: list): + for index, data in enumerate(data_list): + if data: + data[0] = index + 1 + + +def convert_to_float(data: any) -> float: + try: + float_value = float(data) + except Exception: + print('[ERROR] Invalid profiling data which failed to convert data to float.') + return 0.0 + return float_value + + +def convert_to_decimal(data: any) -> Decimal: + try: + decimal_value = Decimal(data) + except Exception: + print('[ERROR] Invalid profiling data which failed to convert data to decimal.') + return 0.0 + return decimal_value + + +def longest_common_subsequence_matching(base_ops: list, comparison_ops: list, name_func: any) -> list: + if not comparison_ops: + result_data = [None] * 
len(base_ops) + for index, value in enumerate(base_ops): + result_data[index] = [value, None] + return result_data + + comparison_len, base_len = len(comparison_ops), len(base_ops) + dp_flag = numpy.zeros(shape=(comparison_len + 1, base_len + 1), dtype=int) + pre_list = [0] * (base_len + 1) + cur_list = [0] * (base_len + 1) + + comparison_index = 1 + iter_comparison_data = iter(comparison_ops) + for comparison_data in iter_comparison_data: + base_index = 1 + iter_base_data = iter(base_ops) + for base_data in iter_base_data: + if name_func(comparison_data) == name_func(base_data): + cur_list[base_index] = pre_list[base_index - 1] + 1 + else: + only_base = cur_list[base_index - 1] + only_comparison = pre_list[base_index] + if only_base < only_comparison: + dp_flag[comparison_index][base_index] = 1 # 1 for only comparison op + cur_list[base_index] = only_comparison + else: + cur_list[base_index] = only_base + base_index += 1 + pre_list = cur_list + comparison_index += 1 + + matched_op = [] + comparison_index, base_index = comparison_len, base_len + while comparison_index > 0 and base_index > 0: + base_data = base_ops[base_index - 1] + comparison_data = comparison_ops[comparison_index - 1] + if name_func(base_data) == name_func(comparison_data): + matched_op.append([base_data, comparison_data]) + comparison_index -= 1 + base_index -= 1 + elif dp_flag[comparison_index][base_index] == 1: # 1 for only comparison op + matched_op.append([None, comparison_data]) + comparison_index -= 1 + else: + matched_op.append([base_data, None]) + base_index -= 1 + while comparison_index > 0: + matched_op.append([None, comparison_ops[comparison_index - 1]]) + comparison_index -= 1 + while base_index > 0: + matched_op.append([base_ops[base_index - 1], None]) + base_index -= 1 + matched_op.reverse() + return matched_op diff --git a/profiler/compare_tools_review/compare_backend/utils/compare_args.py b/profiler/compare_tools_review/compare_backend/utils/compare_args.py new file mode 100644 
index 0000000000000000000000000000000000000000..ab9bc364f440ca8412a6e40d67ca74b7c897cbd9 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/utils/compare_args.py @@ -0,0 +1,24 @@ +class Args: + def __init__(self, + base_profiling_path: str = "", + comparison_profiling_path: str = "", + enable_profiling_compare: bool = False, + enable_operator_compare: bool = False, + enable_memory_compare: bool = False, + enable_communication_compare: bool = False, + output_path: str = "", + max_kernel_num: int = None, + op_name_map: dict = {}, + use_input_shape: bool = False, + gpu_flow_cat: str = ""): + self.base_profiling_path = base_profiling_path + self.comparison_profiling_path = comparison_profiling_path + self.enable_profiling_compare = enable_profiling_compare + self.enable_operator_compare = enable_operator_compare + self.enable_memory_compare = enable_memory_compare + self.enable_communication_compare = enable_communication_compare + self.output_path = output_path + self.max_kernel_num = max_kernel_num + self.op_name_map = op_name_map + self.use_input_shape = use_input_shape + self.gpu_flow_cat = gpu_flow_cat diff --git a/profiler/compare_tools_review/compare_backend/utils/constant.py b/profiler/compare_tools_review/compare_backend/utils/constant.py new file mode 100644 index 0000000000000000000000000000000000000000..1b77b214c85f6733e36298e119e43a778fd7969f --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/utils/constant.py @@ -0,0 +1,80 @@ +class Constant(object): + GPU = "GPU" + NPU = "NPU" + NA = 'N/A' + LIMIT_KERNEL = 3 + MAX_PATH_LENGTH = 4096 + MAX_FLOW_CAT_LEN = 20 + MAX_FILE_SIZE = 1024 * 1024 * 1024 * 5 + BYTE_TO_KB = 1024 + YELLOW_COLOR = "FFFF00" + GREEN_COLOR = "00FF00" + RED_COLOR = "FF0000" + BLUE_COLOR = "00BFFF" + US_TO_MS = 1000 + KB_TO_MB = 1024 + INVALID_VALUE = -1 + + # epsilon + EPS = 1e-15 + + # autority + FILE_AUTHORITY = 0o640 + DIR_AUTHORITY = 0o750 + + PROFILING_TYPE = "profiling type" + + # path + PROFILING_PATH = 
"profiling_path" + TRACE_PATH = "trace_path" + MEMORY_DATA_PATH = "memory_data_path" + ASCEND_OUTPUT_PATH = "ascend_output" + INFO_JSON_PATH = "info_path" + + # excel headers + BASE_PROFILING = 'Base Profiling: ' + COMPARISON_PROFILING = 'Comparison Profiling: ' + + # compare type + OPERATOR_COMPARE = "OperatorCompare" + MEMORY_COMPARE = "MemoryCompare" + + # sheet name + OPERATOR_SHEET = "OperatorCompare" + MEMORY_SHEET = "MemoryCompare" + OPERATOR_TOP_SHEET = "OperatorCompareStatistic" + MEMORY_TOP_SHEET = "MemoryCompareStatistic" + COMMUNICATION_SHEET = "CommunicationCompare" + + # table name + OPERATOR_TABLE = "OperatorCompare" + MEMORY_TABLE = "MemoryCompare" + OPERATOR_TOP_TABLE = "OperatorCompareStatistic" + MEMORY_TOP_TABLE = "MemoryCompareStatistic" + COMMUNICATION_TABLE = "CommunicationCompare" + PERFORMANCE_TABLE = "Model Profiling Time Distribution" + MODULE_TABLE = "ModuleCompare" + MODULE_TOP_TABLE = "ModuleCompareStatistic" + + # memory + SIZE = "Size(KB)" + TS = "ts" + ALLOCATION_TIME = "Allocation Time(us)" + RELEASE_TIME = "Release Time(us)" + NAME = "Name" + + OP_KEY = "op_name" + DEVICE_DUR = "dur" + + BASE_DATA = "base_data" + COMPARISON_DATA = "comparison_data" + OVERALL_METRICS = "overall_metrics" + TORCH_OP = "torch_op" + KERNEL_DICT = "kernel_dict" + MEMORY_LIST = "memory_list" + COMMUNICATION_DICT = "comm_dict" + + #compare type + OVERALL_COMPARE = "overall" + + BWD_LIST = ["bwd", "backward", "back"] diff --git a/profiler/compare_tools_review/compare_backend/utils/excel_config.py b/profiler/compare_tools_review/compare_backend/utils/excel_config.py new file mode 100644 index 0000000000000000000000000000000000000000..306abcdfec6e62f24977b989258ad190a90c9bd7 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/utils/excel_config.py @@ -0,0 +1,185 @@ +from compare_backend.utils.constant import Constant + + +class CellFormatType: + DEFAULT = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', 'border': 
True, + 'num_format': '#,##0'} # 数字显示整数,无背景色 + DEFAULT_FLOAT = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', 'border': True, + 'num_format': '#,##0.00'} # 保留2位小数,无背景色 + DEFAULT_RATIO = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', + 'border': True, 'num_format': '0.00%'} # 百分比显示,保留2位小数,无背景色 + RED_RATIO = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', + 'border': True, 'num_format': '0.00%', "fg_color": Constant.RED_COLOR} # 百分比显示,保留2位小数,单元格背景色为红色 + BOLD_STR = {"font_name": "Arial", 'font_size': 11, 'align': 'left', 'valign': 'vcenter', 'border': True, + 'bold': True} # 字符串,无背景色,字体加粗 + BLUE_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.BLUE_COLOR, 'align': 'left', + 'valign': 'vcenter', 'bold': True, 'border': True} # 蓝色背景,加粗 + GREEN_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.GREEN_COLOR, 'align': 'left', + 'valign': 'vcenter', 'bold': True, 'border': True} # 绿色背景,加粗 + YELLOW_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.YELLOW_COLOR, 'align': 'left', + 'valign': 'vcenter', 'bold': True, 'border': True} # 黄色背景,加粗 + + +class ExcelConfig(object): + ORDER = "Order Id" + OPERATOR_NAME = "Operator Name" + INPUT_SHAPE = "Input Shape" + INPUT_TYPE = "Input Type" + KERNEL_DETAILS = "Kernel Details" + MEMORY_DETAILS = "Allocated Details" + DEVICE_DURATION = "Device Duration(us)" + DIFF_RATIO = "Diff Ratio" + DIFF_DUR = "Diff Duration(us)" + DIFF_SIZE = "Diff Size(KB)" + SIZE = "Size(KB)" + TOP = "Top" + BASE_DEVICE_DURATION = "Base Device Duration(ms)" + COMPARISON_DEVICE_DURATION = "Comparison Device Duration(ms)" + BASE_OPERATOR_NUMBER = "Base Operator Number" + COMPARISON_OPERATOR_NUMBER = "Comparison Operator Number" + DIFF_TIME = "Diff Duration(ms)" + BASE_ALLOCATED_TIMES = "Base Allocated Duration(ms)" + COMPARISON_ALLOCATED_TIMES = "Comparison Allocated Duration(ms)" + BASE_ALLOCATED_MEMORY = "Base 
class ExcelConfig(object):
    """Column layouts for the comparison Excel workbook.

    ``HEADERS`` maps each table name (see ``Constant``) to its ordered column
    specs ({"name", "type" (cell format), "width"}). Columns that appear twice
    in a table are intentional: the first occurrence is the base profiling
    side, the second the comparison side. ``OVERHEAD`` lists the merged header
    cell ranges covering those two sides.
    """
    ORDER = "Order Id"
    OPERATOR_NAME = "Operator Name"
    INPUT_SHAPE = "Input Shape"
    INPUT_TYPE = "Input Type"
    KERNEL_DETAILS = "Kernel Details"
    MEMORY_DETAILS = "Allocated Details"
    DEVICE_DURATION = "Device Duration(us)"
    DIFF_RATIO = "Diff Ratio"
    DIFF_DUR = "Diff Duration(us)"
    DIFF_SIZE = "Diff Size(KB)"
    SIZE = "Size(KB)"
    TOP = "Top"
    BASE_DEVICE_DURATION = "Base Device Duration(ms)"
    COMPARISON_DEVICE_DURATION = "Comparison Device Duration(ms)"
    BASE_OPERATOR_NUMBER = "Base Operator Number"
    COMPARISON_OPERATOR_NUMBER = "Comparison Operator Number"
    DIFF_TIME = "Diff Duration(ms)"
    BASE_ALLOCATED_TIMES = "Base Allocated Duration(ms)"
    COMPARISON_ALLOCATED_TIMES = "Comparison Allocated Duration(ms)"
    BASE_ALLOCATED_MEMORY = "Base Allocated Memory(MB)"
    COMPARISON_ALLOCATED_MEMORY = "Comparison Allocated Memory(MB)"
    DIFF_MEMORY = "Diff Memory(MB)"
    COMM_OP_NAME = "Communication OP Name"
    TASK_NAME = "Task Name"
    CALLS = "Calls"
    TOTAL_DURATION = "Total Duration(us)"
    AVG_DURATION = "Avg Duration(us)"
    MAX_DURATION = "Max Duration(us)"
    MIN_DURATION = "Min Duration(us)"
    MODULE_CLASS = "Module Class"
    MODULE_NAME = "Module Name"
    DEVICE_SELF_TIME = "Device Self Time(ms)"
    DEVICE_TOTAL_TIME = "Device Total Time(ms)"
    DIFF_SELF_TIME = "Device Self Time Diff(ms)"
    DIFF_TOTAL_RATIO = "Total Diff Ratio"
    DIFF_TOTAL_TIME = "Device Total Time Diff(ms)"
    DEVICE_SELF_TIME_US = "Device Self Time(us)"
    DEVICE_TOTAL_TIME_US = "Device Total Time(us)"
    DIFF_SELF_TIME_US = "Device Self Time Diff(us)"
    DIFF_TOTAL_TIME_US = "Device Total Time Diff(us)"
    NUMBER = "Number"
    MODULE_LEVEL = "Module Level"
    BASE_CALL_STACK = "Base Call Stack"
    COMPARISON_CALL_STACK = "Comparison Call Stack"

    HEADERS = {
        # operator compare: order + base columns + comparison columns + diffs
        Constant.OPERATOR_TABLE: [
            {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30},
            {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": INPUT_TYPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": DEVICE_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30},
            {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": INPUT_TYPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": DEVICE_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_DUR, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}
        ],
        # memory compare: same base/comparison layout keyed on allocation size
        Constant.MEMORY_TABLE: [
            {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30},
            {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": INPUT_TYPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": MEMORY_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": SIZE, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30},
            {"name": INPUT_SHAPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": INPUT_TYPE, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": MEMORY_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": SIZE, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_SIZE, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}
        ],
        # per-operator statistics (top-N summary)
        Constant.OPERATOR_TOP_TABLE: [
            {"name": TOP, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30},
            {"name": BASE_DEVICE_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 25},
            {"name": BASE_OPERATOR_NUMBER, "type": CellFormatType.DEFAULT, "width": 25},
            {"name": COMPARISON_DEVICE_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 30},
            {"name": COMPARISON_OPERATOR_NUMBER, "type": CellFormatType.DEFAULT, "width": 30},
            {"name": DIFF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}
        ],
        # per-operator memory statistics (top-N summary)
        Constant.MEMORY_TOP_TABLE: [
            {"name": TOP, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": OPERATOR_NAME, "type": CellFormatType.BOLD_STR, "width": 30},
            {"name": BASE_ALLOCATED_TIMES, "type": CellFormatType.DEFAULT_FLOAT, "width": 25},
            {"name": BASE_ALLOCATED_MEMORY, "type": CellFormatType.DEFAULT_FLOAT, "width": 30},
            {"name": BASE_OPERATOR_NUMBER, "type": CellFormatType.DEFAULT, "width": 25},
            {"name": COMPARISON_ALLOCATED_TIMES, "type": CellFormatType.DEFAULT_FLOAT, "width": 27},
            {"name": COMPARISON_ALLOCATED_MEMORY, "type": CellFormatType.DEFAULT_FLOAT, "width": 33},
            {"name": COMPARISON_OPERATOR_NUMBER, "type": CellFormatType.DEFAULT, "width": 25},
            {"name": DIFF_MEMORY, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}
        ],
        # communication op compare: base columns then comparison columns
        Constant.COMMUNICATION_TABLE: [
            {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": COMM_OP_NAME, "type": CellFormatType.BOLD_STR, "width": 25},
            {"name": TASK_NAME, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": CALLS, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": TOTAL_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": AVG_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": MAX_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": MIN_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": COMM_OP_NAME, "type": CellFormatType.BOLD_STR, "width": 25},
            {"name": TASK_NAME, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": CALLS, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": TOTAL_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": AVG_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": MAX_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": MIN_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17},
            {"name": DIFF_DUR, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20}
        ],
        # module compare statistics (ms granularity)
        Constant.MODULE_TOP_TABLE: [
            {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": MODULE_CLASS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": MODULE_LEVEL, "type": CellFormatType.DEFAULT, "width": 15},
            {"name": MODULE_NAME, "type": CellFormatType.DEFAULT, "width": 35},
            {"name": OPERATOR_NAME, "type": CellFormatType.DEFAULT, "width": 25},
            {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": DEVICE_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": DEVICE_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": DEVICE_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": DEVICE_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 15},
            {"name": BASE_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30},
            {"name": COMPARISON_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30}
        ],
        # module compare detail rows (us granularity)
        Constant.MODULE_TABLE: [
            {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10},
            {"name": MODULE_CLASS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": MODULE_LEVEL, "type": CellFormatType.DEFAULT, "width": 15},
            {"name": MODULE_NAME, "type": CellFormatType.DEFAULT, "width": 35},
            {"name": OPERATOR_NAME, "type": CellFormatType.DEFAULT, "width": 25},
            {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": DEVICE_SELF_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DEVICE_TOTAL_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": OPERATOR_NAME, "type": CellFormatType.DEFAULT, "width": 25},
            {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20},
            {"name": DEVICE_SELF_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DEVICE_TOTAL_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_TOTAL_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_SELF_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20},
            {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 15},
            {"name": BASE_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30},
            {"name": COMPARISON_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30}
        ]
    }

    # Merged header cell ranges ("base side", "comparison side") per table.
    OVERHEAD = {Constant.OPERATOR_TABLE: ["B1:F1", "G1:K1"], Constant.MEMORY_TABLE: ["B1:F1", "G1:K1"],
                Constant.COMMUNICATION_TABLE: ["B1:H1", "I1:O1"], Constant.OPERATOR_TOP_TABLE: ["C1:D1", "E1:F1"],
                Constant.MEMORY_TOP_TABLE: ["C1:E1", "F1:H1"], Constant.MODULE_TOP_TABLE: ["F1:I1", "J1:M1"],
                Constant.MODULE_TABLE: ["E1:H1", "I1:L1"]}
[y/n]") + if check_msg.lower() != "y": + print(f"[WARNING] The user choose not to read the file: {file_path}") + return [] + try: + with open(file_path, "rt") as file: + json_data = json.loads(file.read()) + except Exception as e: + msg = f"Can't read file: {file_path}" + raise RuntimeError(msg) from e + return json_data + + @classmethod + def read_csv_file(cls, file_path: str, bean_class: any = None) -> any: + PathManager.check_path_readable(file_path) + if not os.path.isfile(file_path): + raise FileNotFoundError("File not exists.") + file_size = os.path.getsize(file_path) + if file_size <= 0: + return [] + if file_size > Constant.MAX_FILE_SIZE: + check_msg = input( + f"The file({file_path}) size exceeds the preset max value. Continue reading the file? [y/n]") + if check_msg.lower() != "y": + print(f"[WARNING] The user choose not to read the file: {file_path}") + return [] + result_data = [] + try: + with open(file_path, newline="") as csv_file: + reader = csv.DictReader(csv_file) + for row in reader: + row_data = bean_class(row) if bean_class else row + result_data.append(row_data) + except Exception as e: + msg = f"Failed to read the file: {file_path}" + raise RuntimeError(msg) from e + return result_data + + @classmethod + def check_json_type(cls, file_path: str) -> str: + json_data = cls.read_trace_file(file_path) + if isinstance(json_data, dict): + return Constant.GPU + return Constant.NPU diff --git a/profiler/compare_tools_review/compare_backend/utils/module_node.py b/profiler/compare_tools_review/compare_backend/utils/module_node.py new file mode 100644 index 0000000000000000000000000000000000000000..f85606094ede7abc378c1b3d017b4a98c8800107 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/utils/module_node.py @@ -0,0 +1,171 @@ +import re +from math import ceil + +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.utils.torch_op_node import TorchOpNode + + +class ModuleNode: + ts 
class ModuleNode:
    """Node of the nn.Module call tree built from trace events.

    Tracks self/total device kernel time and a parallel tree of torch ops
    executed inside the module.
    """
    # dict keys used in the kernel self/total lists
    ts = "ts"
    kernels = "kernels"

    def __init__(self, event: TraceEventBean, parent_node=None):
        self._event = event
        self._parent_node = parent_node
        self._child_nodes = []
        # full path name, e.g. "Model/Layer_1/Linear_0"
        self._module_name = f"{parent_node.module_name}/{event.name}" if parent_node else event.name
        self._module_level = parent_node.module_level + 1 if parent_node else 1
        # kernels launched directly by this module / by this module and its children
        self._kernel_self_list = []
        self._kernel_total_list = []
        self._call_stack = f"{parent_node.call_stack};\n{event.name}" if parent_node and parent_node.call_stack \
            else event.name
        # synthetic root of the torch-op tree recorded under this module
        self._root_torch_op_node = TorchOpNode()
        self._cur_torch_op_node = self._root_torch_op_node

    @property
    def module_name(self):
        return self._module_name

    @property
    def module_class(self):
        # strip the trailing instance index, e.g. "Linear_0" -> "Linear"
        pattern = re.compile('_[0-9]+$')
        return pattern.sub('', self.name.split("/")[-1])

    @property
    def module_level(self):
        return self._module_level

    @property
    def name(self):
        return self._event.name

    @property
    def parent_node(self):
        return self._parent_node

    @property
    def child_nodes(self):
        return self._child_nodes

    @property
    def dur(self):
        return self._event.dur

    @property
    def start_time(self):
        return self._event.start_time

    @property
    def end_time(self):
        return self._event.end_time

    @property
    def host_self_dur(self):
        # host time spent in this module excluding child modules
        return self.dur - sum([node.dur for node in self.child_nodes])

    @property
    def device_self_dur(self):
        # device time of kernels attributed directly to this module
        dur = 0
        for kernel_dict in self._kernel_self_list:
            kernel_list = kernel_dict.get(self.kernels, [])
            dur += sum([kernel.device_dur for kernel in kernel_list])
        return dur

    @property
    def device_total_dur(self):
        # device time of kernels from this module and all descendants
        dur = 0
        for kernel_dict in self._kernel_total_list:
            kernel_list = kernel_dict.get(self.kernels, [])
            dur += sum([kernel.device_dur for kernel in kernel_list])
        return dur

    @property
    def kernel_details(self):
        # concatenated detail strings of the module's own kernels
        kernel_details = ""
        for kernel_dict in self._kernel_self_list:
            kernel_list = kernel_dict.get(self.kernels, [])
            for kernel in kernel_list:
                kernel_details += kernel.kernel_details
        return kernel_details

    @property
    def toy_layer_api_list(self):
        # top-level torch ops recorded directly under this module
        return self._root_torch_op_node.child_nodes

    @property
    def call_stack(self):
        return self._call_stack

    @staticmethod
    def _binary_search(ts_time, parent_node):
        """Return the child of *parent_node* whose [start, end) interval contains
        ts_time, or None. Children are assumed sorted by start_time."""
        if not parent_node.child_nodes:
            return None
        right = len(parent_node.child_nodes) - 1
        left = 0
        while right > left:
            mid = left + ceil((right - left) / 2)
            if ts_time >= parent_node.child_nodes[mid].start_time:
                left = mid
            else:
                right = mid - 1
        if parent_node.child_nodes[left].start_time < ts_time < parent_node.child_nodes[left].end_time:
            return parent_node.child_nodes[left]
        return None

    def reset_call_stack(self, call_stack):
        self._call_stack = call_stack

    def update_child_nodes(self, node):
        self._child_nodes.append(node)

    def update_kernel_list(self, ts, kernel_list: list):
        """Attribute kernels launched at *ts* to this module (self) and to the
        total of every ancestor on the path to the root."""
        self._update_kernel_self_list(ts, kernel_list)
        node = self
        # NOTE(review): the loop condition means the tree root itself never
        # receives a total update — confirm this is intentional.
        while node.parent_node:
            node._update_kernel_total_list(ts, kernel_list)
            node = node.parent_node

    def _update_kernel_self_list(self, ts, kernel_list: list):
        self._kernel_self_list.append({self.ts: ts, self.kernels: kernel_list})

    def _update_kernel_total_list(self, ts, kernel_list: list):
        self._kernel_total_list.append({self.ts: ts, self.kernels: kernel_list})

    def find_module_call(self, ts_time):
        """Descend the subtree to the deepest module whose interval contains ts_time."""
        call_module = self._binary_search(ts_time, self)
        while call_module:
            module = self._binary_search(ts_time, call_module)
            if not module:
                return call_module
            call_module = module
        return call_module

    def find_torch_op_call(self, event):
        """Insert *event* into the torch-op tree: pop finished ops off the
        current path, then attach the event as a child of the current op."""
        while self._cur_torch_op_node:
            if self._cur_torch_op_node != self._root_torch_op_node and \
                    event.start_time > self._cur_torch_op_node.end_time:
                self._cur_torch_op_node = self._cur_torch_op_node.parent
                continue
            tree_node = TorchOpNode(event, self._cur_torch_op_node)
            self._cur_torch_op_node.add_child_node(tree_node)
            self._cur_torch_op_node = tree_node
            break

    def update_torch_op_kernel_list(self):
        """Distribute this module's own kernels to the top-level torch ops whose
        time interval covers each kernel launch timestamp."""
        top_node_list = self._root_torch_op_node.child_nodes
        if not top_node_list:
            return
        top_node_list.sort(key=lambda x: x.start_time)
        cur_index = 0
        self._kernel_self_list.sort(key=lambda x: x.get(self.ts, 0))
        for kernel_dict in self._kernel_self_list:
            ts = kernel_dict.get(self.ts, 0)
            kernel_list = kernel_dict.get(self.kernels, [])
            while cur_index < len(top_node_list):
                if ts > top_node_list[cur_index].end_time:
                    cur_index += 1
                    continue
                if ts < top_node_list[cur_index].start_time:
                    break
                top_node_list[cur_index].update_kernel_list(kernel_list)
                break
class NameFunction:
    """Selects how operators/modules are named for matching, based on the CLI
    options ``--op_name_map`` (rename table) and ``--use_input_shape``."""

    def __init__(self, args: any):
        self.args = args

    @classmethod
    def get_name(cls, op_node: "TorchOpNode") -> str:
        """Plain operator name."""
        return op_node.name

    @classmethod
    def get_full_name(cls, op_node: "TorchOpNode") -> str:
        """Operator name suffixed with its input shapes."""
        if isinstance(op_node.origin_input_shape, list):
            data = []
            for dim in op_node.origin_input_shape:
                data.append(','.join([str(x) for x in dim]))
            input_shape = ';\r\n'.join(data)
            return f'{op_node.name}{input_shape}'
        return f'{op_node.name}{op_node.input_shape}'

    def get_name_func(self):
        """Return the naming function implied by the options."""
        if not self.args.op_name_map and not self.args.use_input_shape:
            name_func = self.get_name
        elif self.args.op_name_map and not self.args.use_input_shape:
            name_func = self.get_map_name
        elif not self.args.op_name_map and self.args.use_input_shape:
            # Bug fix: this branch previously duplicated the condition above
            # (op_name_map and not use_input_shape), so get_full_name was
            # unreachable and this case fell through to get_full_map_name.
            name_func = self.get_full_name
        else:
            name_func = self.get_full_map_name
        return name_func

    def get_map_name(self, op_node: "TorchOpNode") -> str:
        """Operator name remapped through --op_name_map (identity when absent)."""
        return self.args.op_name_map.get(op_node.name, op_node.name)

    def get_full_map_name(self, op_node: "TorchOpNode") -> str:
        """Remapped operator name suffixed with its input shapes."""
        if isinstance(op_node.origin_input_shape, list):
            data = []
            for dim in op_node.origin_input_shape:
                data.append(','.join([str(x) for x in dim]))
            input_shape = ';\r\n'.join(data)
            return f'{self.args.op_name_map.get(op_node.name, op_node.name)}{input_shape}'
        return f'{self.args.op_name_map.get(op_node.name, op_node.name)}{op_node.input_shape}'

    def get_module_name(self, module: "ModuleNode") -> str:
        """Module path name with every --op_name_map substitution applied."""
        if not self.args.op_name_map:
            return module.module_name
        name = module.module_name
        for old_name, new_name in self.args.op_name_map.items():
            # Bug fix: str.replace returns a new string; the original discarded
            # the result, so --op_name_map never affected module names.
            name = name.replace(old_name, new_name)
        return name
@property + def origin_input_shape(self): + return self._event.args.get("Input Dims", Constant.NA) + + @property + def input_type(self): + return str(self._event.args.get("Input type", Constant.NA)) + + @property + def call_stack(self): + return str(self._event.args.get("Call stack", Constant.NA)) + + @property + def parent(self): + return self._parent_node + + @property + def child_nodes(self): + return self._child_nodes + + @property + def kernel_list(self): + return self._kernel_list + + @property + def kernel_num(self): + return self._kernel_num + + @property + def memory_allocated(self): + return self._memory_allocated_list + + @property + def device_dur(self): + return sum([kernel.device_dur for kernel in self._kernel_list]) + + def add_child_node(self, child_node): + self._child_nodes.append(child_node) + + def set_kernel_list(self, kernel_list: list): + if not kernel_list: + return + self._kernel_list.extend(kernel_list) + kernel_num = len(kernel_list) + cur_node = self + while cur_node._parent_node: + cur_node._kernel_num += kernel_num + cur_node = cur_node._parent_node + + def update_kernel_list(self, kernel_list: list): + if not kernel_list: + return + self._kernel_list.extend(kernel_list) + + def set_memory_allocated(self, memory_allocated: MemoryEvent): + self._memory_allocated_list.append(memory_allocated) + + def is_step_profiler(self) -> bool: + return self._event.is_step_profiler() + + def get_op_info(self) -> list: + return [self.name, self.input_shape, self.input_type, self.call_stack] diff --git a/profiler/compare_tools_review/compare_backend/utils/tree_builder.py b/profiler/compare_tools_review/compare_backend/utils/tree_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..34c1fe1a1f4046d1e60af107f5ee74484424174a --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/utils/tree_builder.py @@ -0,0 +1,82 @@ +from queue import Queue + +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import 
class TreeBuilder:
    """Builds the torch-op call tree and the module tree from trace events."""

    @staticmethod
    def _iter_nodes(root):
        # Breadth-first walk yielding root first, then each level in order.
        node_queue = Queue()
        node_queue.put(root)
        while not node_queue.empty():
            node = node_queue.get()
            yield node
            for child in node.child_nodes:
                node_queue.put(child)

    @classmethod
    def build_tree(cls, event_list: list, kernel_dict: dict, memory_list: list) -> TorchOpNode:
        """Build the op call tree; memory events attach to the op active at their timestamp."""
        root_node = TorchOpNode()
        merged = sorted(event_list + memory_list, key=lambda evt: evt.start_time)
        cursor = root_node
        for evt in merged:
            while cursor:
                # Climb out of every op that already ended before this event.
                if cursor is not root_node and evt.start_time > cursor.end_time:
                    cursor = cursor.parent
                    continue
                if evt.is_torch_op:
                    op_node = TorchOpNode(evt, cursor)
                    cursor.add_child_node(op_node)
                    op_node.set_kernel_list(kernel_dict.get(evt.start_time, []))
                    cursor = op_node
                else:
                    # Memory event: attribute it to the currently active op.
                    evt.set_name(cursor.name)
                    cursor.set_memory_allocated(evt)
                break
        return root_node

    @classmethod
    def get_total_kernels(cls, root_node: TorchOpNode) -> list:
        """Collect every kernel in the tree (root first, then BFS order)."""
        result_list = []
        for node in cls._iter_nodes(root_node):
            result_list.extend(node.kernel_list)
        return result_list

    @classmethod
    def get_total_memory(cls, root_node: TorchOpNode) -> list:
        """Collect every memory event in the tree (root first, then BFS order)."""
        result_list = []
        for node in cls._iter_nodes(root_node):
            result_list.extend(node.memory_allocated)
        return result_list

    @classmethod
    def build_module_tree(cls, event_list: list, kernel_dict: dict):
        """Build the module hierarchy; kernels attach to the active module node."""
        root_node = ModuleNode(TraceEventBean({}))
        event_list.sort(key=lambda evt: evt.start_time)
        cursor = root_node
        for evt in event_list:
            while cursor:
                # Climb out of modules that ended before this event started.
                if cursor is not root_node and evt.start_time > cursor.end_time:
                    cursor = cursor.parent_node
                    continue
                if evt.is_x_mode():
                    module_node = ModuleNode(evt, cursor)
                    cursor.update_child_nodes(module_node)
                    cursor = module_node
                    break
                if cursor is root_node:
                    # Events outside any module are dropped.
                    break
                kernel_list = kernel_dict.get(evt.start_time, [])
                if kernel_list:
                    cursor.update_kernel_list(evt.start_time, kernel_list)
                break
        return root_node
+ + def __init__(self, data_dict: dict, file_path: str, args: any): + super().__init__(data_dict) + self._file_path = file_path + self._args = args + + def generate_view(self): + workbook = Workbook(self._file_path) + for sheet_name, data in self._data_dict.items(): + WorkSheetCreator(workbook, sheet_name, data, self._args).create_sheet() + workbook.close() + os.chmod(self._file_path, Constant.FILE_AUTHORITY) diff --git a/profiler/compare_tools_review/compare_backend/view/screen_view.py b/profiler/compare_tools_review/compare_backend/view/screen_view.py new file mode 100644 index 0000000000000000000000000000000000000000..150b36c6feda79cafacd7e4980624cd51e116912 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/view/screen_view.py @@ -0,0 +1,19 @@ +from prettytable import PrettyTable + +from compare_backend.view.base_view import BaseView + + +class ScreenView(BaseView): + def __init__(self, data_dict: dict): + super().__init__(data_dict) + + def generate_view(self): + for sheet_name, data in self._data_dict.items(): + if not data.get("rows", []): + return + table = PrettyTable() + table.title = sheet_name + table.field_names = data.get("headers", []) + for row in data.get("rows", []): + table.add_row(row) + print(table) diff --git a/profiler/compare_tools_review/compare_backend/view/work_sheet_creator.py b/profiler/compare_tools_review/compare_backend/view/work_sheet_creator.py new file mode 100644 index 0000000000000000000000000000000000000000..7a33168da377ae77ab64fff0886e09eef065b4e2 --- /dev/null +++ b/profiler/compare_tools_review/compare_backend/view/work_sheet_creator.py @@ -0,0 +1,60 @@ +from xlsxwriter import Workbook + +from compare_backend.utils.excel_config import ExcelConfig, CellFormatType + + +class WorkSheetCreator: + def __init__(self, work_book: Workbook, sheet_name: str, data: dict, args: any): + self._work_book = work_book + self._sheet_name = sheet_name + self._data = data + self._args = args + self._work_sheet = None + 
self._row_id = 1 + self._field_format = {} + self._diff_ratio_index = None + self._col_ids = "ABCDEFGHIJKLMNOPQRSTUVW" + + def create_sheet(self): + if not self._data.get("rows", []): + return + self._work_sheet = self._work_book.add_worksheet(self._sheet_name) + self._write_headers() + self._write_data() + + def _write_headers(self): + base_header_format = self._work_book.add_format(CellFormatType.GREEN_BOLD) + com_header_format = self._work_book.add_format(CellFormatType.YELLOW_BOLD) + com_index_range = [-1, -1] + overhead = self._data.get("overhead", []) + if overhead: + base_path = f"Base Profiling: {self._args.base_profiling_path}" + self._work_sheet.merge_range(overhead[0], base_path, base_header_format) + com_index_range = [self._col_ids.index(overhead[1].split(":")[0][0]), + self._col_ids.index(overhead[1].split(":")[1][0])] + comparison_path = f"Comparison Profiling: {self._args.comparison_profiling_path}" + self._work_sheet.merge_range(overhead[1], comparison_path, com_header_format) + self._row_id += 2 + for index, header in enumerate(self._data.get("headers")): + if index in range(com_index_range[0], com_index_range[1] + 1): + header_format = com_header_format + else: + header_format = base_header_format + col_id = self._col_ids[index] + self._work_sheet.set_column(f"{col_id}:{col_id}", header.get("width")) + self._work_sheet.write(f"{col_id}{self._row_id}", header.get("name"), header_format) + self._field_format[index] = self._work_book.add_format(header.get("type")) + if header.get("name") in (ExcelConfig.DIFF_RATIO, ExcelConfig.DIFF_TOTAL_RATIO): + self._diff_ratio_index = index + self._row_id += 1 + + def _write_data(self): + red_ratio_format = self._work_book.add_format(CellFormatType.RED_RATIO) + for data in self._data.get("rows"): + for index, cell_data in enumerate(data): + cell_format = self._field_format.get(index) + if index == self._diff_ratio_index and cell_data and cell_data > 1: + cell_format = red_ratio_format + cell_data = "INF" if 
cell_data == float('inf') else cell_data + self._work_sheet.write(f"{self._col_ids[index]}{self._row_id}", cell_data, cell_format) + self._row_id += 1 diff --git a/profiler/compare_tools_review/compare_interface/__init__.py b/profiler/compare_tools_review/compare_interface/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/compare_tools_review/compare_interface/comparison_interface.py b/profiler/compare_tools_review/compare_interface/comparison_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..919095b310126f2ce0c9c3e6912fb10f24d149e9 --- /dev/null +++ b/profiler/compare_tools_review/compare_interface/comparison_interface.py @@ -0,0 +1,31 @@ +import sys +import os + +sys.path.append( + os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "cluster_analyse")) +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from compare_backend.comparison_generator import ComparisonGenerator +from compare_backend.disaggregate.overall_perf_interface import OverallPerfInterface +from compare_backend.utils.compare_args import Args +from compare_backend.utils.constant import Constant + + +class ComparisonInterface: + def __init__(self, base_profiling_path: str, comparison_profiling_path: str = ""): + self.base_profiling_path = base_profiling_path + if comparison_profiling_path: + self._args = Args(base_profiling_path=base_profiling_path, + comparison_profiling_path=comparison_profiling_path) + + def compare(self, compare_type: str) -> dict: + if compare_type == Constant.OVERALL_COMPARE: + self._args.enable_profiling_compare = True + + return ComparisonGenerator(self._args).run_interface(compare_type) + + def disaggregate_perf(self, compare_type: str) -> dict: + if compare_type != Constant.OVERALL_COMPARE: + print('[ERROR] Invalid compare_type value: {compare_type} which not supported.') + return 
{} + return OverallPerfInterface(self.base_profiling_path).run() diff --git a/profiler/compare_tools_review/performance_compare.py b/profiler/compare_tools_review/performance_compare.py new file mode 100644 index 0000000000000000000000000000000000000000..4676355a28de80d73f01f75b23b102ebf4ff1a79 --- /dev/null +++ b/profiler/compare_tools_review/performance_compare.py @@ -0,0 +1,36 @@ +import argparse +import ast +import datetime +import os.path +import sys + +sys.path.append( + os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "cluster_analyse")) + +from compare_backend.comparison_generator import ComparisonGenerator + + +def main(): + parser = argparse.ArgumentParser(description="Compare trace of GPU and NPU") + parser.add_argument("base_profiling_path", type=str, default='', help="基准性能数据的文件路径") + parser.add_argument("comparison_profiling_path", type=str, default='', help="比较性能数据的文件路径") + parser.add_argument("--enable_profiling_compare", default=False, action='store_true', help="开启总体性能比较") + parser.add_argument("--enable_operator_compare", default=False, action='store_true', help="开启算子性能比较") + parser.add_argument("--enable_memory_compare", default=False, action='store_true', help="开启算子内存比较") + parser.add_argument("--enable_communication_compare", default=False, action='store_true', help="开启通信性能比较") + parser.add_argument("--output_path", type=str, default='', help="性能数据比对结果的存放路径") + parser.add_argument("--max_kernel_num", type=int, help="每个torch op的kernel数量限制") + parser.add_argument("--op_name_map", type=ast.literal_eval, default={}, + help="配置GPU与NPU等价的算子名称映射关系,以字典的形式传入") + parser.add_argument("--use_input_shape", default=False, action='store_true', help="开启算子的精准匹配") + parser.add_argument("--gpu_flow_cat", type=str, default='', help="gpu flow event的分类标识") + args = parser.parse_args() + + ComparisonGenerator(args).run() + + +if __name__ == "__main__": + start_time = datetime.datetime.now() + main() + end_time = datetime.datetime.now() + 
print(f'[INFO] The comparison task has been completed in a total time of {end_time - start_time}') diff --git a/profiler/merge_profiling_timeline_review/README.md b/profiler/merge_profiling_timeline_review/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5075f6bc2fcc8bf04b435562f28a50229b92362e --- /dev/null +++ b/profiler/merge_profiling_timeline_review/README.md @@ -0,0 +1,115 @@ +# 合并大json工具 + +merge_profiling_timeline(合并大json工具)支持合并Profiling的timeline数据,支持合并指定rank的timline、合并指定timeline中的item。 + + +## 多timeline融合 + +### 性能数据采集 + +使用Ascend PyTorch Profiler或者E2E性能采集工具采集性能数据,E2E profiling将被废弃,不建议使用。Ascend PyTorch Profiler采集方式参考:[Profiling数据采集](https://gitee.com/ascend/att/tree/master/profiler)。将采集到的所有节点的性能数据拷贝到当前环境同一目录下,以下假设数据在/home/test/cann_profiling下。 + +E2E Profiling数据目录结构示例如下: + +```bash +|- cann_profiling + |- PROF_*** + |- timeline + |- msprof.json + |- device_* + |- info.json.* + ... + |- PROF_*** + ... +``` + +Ascend PyTorch Profiler数据目录结构示例如下: + +```bash +|- ascend_pytorch_profiling + |- **_ascend_pt + |- ASCEND_PROFILER_OUTPUT + |- trace_view.json + |- FRAMEWORK + |- PROF_*** + |- **_ascend_pt +``` + +### 参数说明 + +| 参数名称 | 说明 | 是否必选 | +| -------- | ------------------------------------------------------------ | -------- | +| -i | 指定Profiling数据目录路径。 | 是 | +| --type | 指定需要合并timeline场景,可选取值:`pytorch`(通过Ascend PyTorch Profiler方式采集profiling数据,合并所有卡的trace_view.json)、`e2e`(通过E2E Profiling方式采集Profiling数据,优先合并总timeline,没有生成则选择合并device目录下的msprof_*.json)、`custom` (自定义需要合并的timeline数据,具体参考**使用示例**)。 | 是 | +| -o | 指定合并后的timeline文件输出的路径(路径末尾可以设置文件名,具体用法参考**使用示例**),不设置该参数的情况下默认文件输出的路径为当前目录(默认文件名为merged.json)。 | 否 | +| --rank | 指定需要合并timeline的Rank ID,默认全部合并。 | 否 | +| --items | 指定需要合并的Profiling数据项,包括:python、Ascend Hardware、CANN、HCCL、PTA、Overlap Analysis,默认全部合并。 | 否 | + +### 使用示例 + +1. 
合并单机多卡timeline,默认合并所有卡、所有数据项,生成first.json在path/to/cann_profiling/output/目录下 + + ```bash + python3 main.py -i path/to/cann_profiling/ -o path/to/cann_profiling/output/first --type pytorch + ``` + +2. 合并单机多卡timeline,默认合并所有卡、所有数据项,不设置-o参数时默认生成merge.json在当前目录下 + + ```bash + python3 main.py -i path/to/cann_profiling/ --type pytorch + ``` + +3. 合并单机多卡timeline,只合并0卡和1卡 + + ```bash + python3 main.py -i path/to/cann_profiling/ -o path/to/cann_profiling/output/2p --type pytorch --rank 0,1 + ``` + +4. 合并单机多卡timeline,合并所有卡的CANN层和Ascend_Hardware层数据 + + ```bash + python3 main.py -i path/to/cann_profiling/ --type pytorch --items "CANN,Ascend Hardware" + ``` + +5. 合并多timeline(自定义) + + 以上场景不支持的情况下,可以使用自定义的合并方式,将需要合并的timeline文件放在同一目录下(附:该场景比较特殊,与正常合并不同,无法直接读取info.json中的rank_id,因此该场景下的rank_id为默认分配的序号,用于区分不同文件的相同层,不代表实际rank_id) + 数据目录结构示意如下: + + ```bash + |- timeline + |- msprof_0.json + |- msprof_1.json + |- msprof_2.json + |- hccl_3.json + |- hccl_4.json + ... + ``` + + 通过下面的命令合并所有timeline,同样支持-o、--rank、--items等参数。 + + ```bash + python3 main.py -i path/to/timeline/ -o path/to/timeline/xxx --type custom + ``` + + 合并timeline查看:在 -o 指定的目录(不设置-o时默认在当前目录下的merged.json)的xxx.json为合并后的文件。 + + +## 超大timeline文件查看 + +[下载whl](https://gitee.com/aerfaliang/trace_processor/releases/download/trace_processor_37.0/trace_processor-37.0-py3-none-any.whl)包并执行如下命令安装(windows): + +```bash +pip3 install trace_processor-37.0-py3-none-any.whl +``` + +安装完成后直接执行如下命令: + +```bash +python -m trace_processor --httpd path/to/xxx_merged.json +``` + +等待加载完毕,刷新[perfetto](https://ui.perfetto.dev/)界面,单击Use old version regardless,再单击`YES, use loaded trace`即可展示timeline(通过W放大、S缩小、A左移、D右移来查看timeline文件)。 + +![输入图片说明](perfetto%E4%BD%BF%E7%94%A8%E6%8C%87%E5%AF%BC%E6%88%AA%E5%9B%BE1.png) +![输入图片说明](perfetto%E4%BD%BF%E7%94%A8%E6%8C%87%E5%AF%BC%E6%88%AA%E5%9B%BE2.png) \ No newline at end of file diff --git a/profiler/merge_profiling_timeline_review/main.py b/profiler/merge_profiling_timeline_review/main.py new file mode 100644 
index 0000000000000000000000000000000000000000..678f5d5a8f7be8c45d6c4935f2941bd716d77a78 --- /dev/null +++ b/profiler/merge_profiling_timeline_review/main.py @@ -0,0 +1,233 @@ +#! /usr/bin/python3 +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import re + +from functools import partial +from argparse import ArgumentParser +from decimal import Decimal + + +FILTER_DIRS = [".profiler", "HCCL_PROF", "timeline", "query", 'sqlite', 'log'] +RANK_ID_POS = 1000 + +def get_path_dir(path: str) -> list: + """ + check result path exist JOB dir + path : result path + """ + path_dir_filter = filter(partial(_path_dir_filter_func, root_dir=path), os.listdir(path)) + sub_dirs = list(path_dir_filter) + return sub_dirs + + +def _path_dir_filter_func(sub_path, root_dir): + return sub_path not in FILTER_DIRS and os.path.isdir(os.path.realpath(os.path.join(root_dir, sub_path))) + + +def natural_sort(files): + convert = lambda text: int(text) if text.isdigit() else text.lower() + alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)] + return sorted(files, key=alphanum_key) + + +def get_timeline_info(args, prof_dirs): + timeline_info = {} + + for prof in prof_dirs: + pro_path = os.path.join(args.input, prof) + + # 从info.json读取rank_id + rank_id = get_rank_id_from_info_json(pro_path) + if rank_id is None: + print(f"WARN, There is not rank id info in {pro_path}") + continue + + timeline_path = 
get_timeline_path(pro_path, args.type) + + if os.path.exists(timeline_path): + timeline_info[rank_id] = timeline_path + else: + print(f"WARN, The file \"{timeline_path}\" does not exist.") + return timeline_info + + +def get_timeline_path(pro_path, type): + for root, dirs, files in os.walk(pro_path): + for dir_ in dirs: + if 'ASCEND_PROFILER_OUTPUT' == dir_ and type == 'pytorch': + timeline_path = os.path.realpath(os.path.join(root, dir_, 'trace_view.json')) + return timeline_path + + for file_ in sorted(files, reverse=True): + if 'msprof' in file_: + timeline_path = os.path.join(root, file_) + return timeline_path + return + +def get_rank_id_from_info_json(pro_path): + info_json = "" + rank_id = None + for root, dirs, files in os.walk(pro_path): + for file in files: + if "info.json." in file and ".done" not in file: + info_json = os.path.join(root, file) + break + + if info_json: + if os.path.islink(info_json): + print(f"The file: \"{info_json}\" is link. Please check the path.") + return + try: + with open(info_json, "r+") as f: + info = json.load(f) + rank_id = info.get("rank_id") + except Exception as err: + print("[ERROR] %s" % err) + return + return rank_id + + +def merge_timeline_general(args): + """合并e2e profiling生成的msprof*.json""" + if not os.path.isdir(args.input): + print(f"No such file or directory: \"{args.input}\". Please check the path.") + return + prof_dir = get_path_dir(args.input) + if not prof_dir: + message = f"The path \"{args.input}\" does not have PROF dir. Please check the path." 
def merge_timeline_custom(args):
    """Merge every timeline file found directly under args.input (custom mode)."""
    timeline_files = natural_sort(os.listdir(args.input))
    timeline_files_dict = {}
    # No info.json here, so files get sequential ids instead of real rank ids.
    for idx, timeline_file in enumerate(timeline_files):
        timeline_files_dict[idx] = os.path.join(args.input, timeline_file)
    # Restrict to selected profiling items, when given.
    process_list = args.items.split(",") if args.items else None
    merge_timeline_events(timeline_files_dict, process_list)


def merge_timeline_events(timeline_file_dict, process_list, output=None):
    """Merge the given timeline files into one chrome-trace json.

    timeline_file_dict: rank_id (or index) -> timeline file path.
    process_list: process row names to keep; None keeps every process.
    output: output path prefix (".json" is appended).  Defaults to the global
        CLI ``args.output``.  BUGFIX(review): the function previously relied
        on the module-global ``args`` implicitly; this parameter makes the
        dependency explicit while keeping existing call sites working.
    """
    out_prefix = args.output if output is None else output
    new_events = []
    for rank_id, timeline_path in timeline_file_dict.items():
        # (review) removed unused ``node = rank_id // 8``.
        print("rank id: ", rank_id, "timeline file: ", timeline_path)
        if os.path.islink(timeline_path):
            print(f"The file: \"{timeline_path}\" is link. Please check the path.")
            return
        try:
            # BUGFIX(review): "r+" -> "r"; the file is only read here.
            with open(timeline_path, 'r') as f:
                cur_events = json.load(f)
        except Exception as err:
            print("[ERROR] %s" % err)
            return

        # Map process-name metadata rows to their pid.
        proc_pid_dict = {}
        for event in cur_events:
            if event.get("name") == "process_name" and event.get("ph") == "M":
                if event.get("args"):
                    proc_pid_dict[event["args"].get("name")] = event.get("pid")
        process_list_tmp = process_list if process_list else list(proc_pid_dict.keys())
        # Collect the pids of the items to merge.
        merged_pids = set()
        for pro in process_list_tmp:
            if pro not in proc_pid_dict.keys():
                print(f"main.py: error argument --items: invalid choice: '{pro}' (choose from {list(proc_pid_dict.keys())})")
                return
            merged_pids.add(proc_pid_dict.get(pro))

        for event in cur_events:

            # Keep only events of the selected processes.
            if merged_pids and event.get('pid') not in merged_pids:
                continue

            # tid is expected to be an int.
            if not isinstance(event['tid'], int):
                # BUGFIX(review): fixed "[WARNNING]" typo in the message.
                print(f"[WARNING] {event['tid']} is not int type")

            # Suffix process names with the rank id so rows stay distinct.
            if event.get("name") == "process_name" and event.get("ph") == "M":
                if event.get("args") is not None and event["args"].get("name") is not None:
                    event["args"]["name"] = event["args"]["name"] + f"_{rank_id}"

            # Re-key flow-event ids so they stay unique across ranks.
            if event.get('id') and (event.get('ph') == 's' or event.get('ph') == 'f'):
                event['id'] = float(event.get('id')) * RANK_ID_POS + rank_id

            new_events.append(event)
    out_path = f"{out_prefix}.json"
    if os.path.islink(out_path):
        print(f"The file: \"{out_path}\" is link. Please check the path.")
        return
    if os.path.exists(out_path):
        print(f"File {out_path} existed before and is now overwritten.")
        os.remove(out_path)
    try:
        # Create with mode 0o640 for security.
        with os.fdopen(os.open(out_path, os.O_WRONLY | os.O_CREAT, 0o640), 'w') as f:
            json.dump(new_events, f)
    except FileNotFoundError:
        print(f"Param -o (output path) is not exists, please check it.")
        return
    print(f"timeline merged output path: {out_path}")


def parse_args():
    """CLI for the timeline merger."""
    parser = ArgumentParser(description="Merge timeline for multi card")
    parser.add_argument("-i", "--input", default=None, help="root dir of PROF_* data")
    parser.add_argument("-o", "--output", default="./merged", help="save path of merged.json ")
    parser.add_argument("--rank", default=None, help="List of ranks to be merged. By default, all ranks are merged")
    parser.add_argument("--items", default=None, help="Specify the data items (python,CANN,Ascend Hardware,HCCL,..)to be merged. in the timeline.")
    parser.add_argument("--type", choices=('pytorch', 'e2e', 'custom'), help="Customize the timeline file to be merged.")
    arg = parser.parse_args()
    return arg


if __name__ == "__main__":
    args = parse_args()
    print("========================== start merge timeline ====================")
    if args.type == "custom":
        merge_timeline_custom(args)
    else:
        merge_timeline_general(args)
"a/profiler/merge_profiling_timeline_review/perfetto\344\275\277\347\224\250\346\214\207\345\257\274\346\210\252\345\233\2762.png" "b/profiler/merge_profiling_timeline_review/perfetto\344\275\277\347\224\250\346\214\207\345\257\274\346\210\252\345\233\2762.png" new file mode 100644 index 0000000000000000000000000000000000000000..48793f136e48f21f618ff3cb13bdcc3388f76930 Binary files /dev/null and "b/profiler/merge_profiling_timeline_review/perfetto\344\275\277\347\224\250\346\214\207\345\257\274\346\210\252\345\233\2762.png" differ diff --git a/profiler/prof_common_review/__init__.py b/profiler/prof_common_review/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/prof_common_review/analyze_dict.py b/profiler/prof_common_review/analyze_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..a06577e8fb49436f7b867e8e74495cc76a6a58b2 --- /dev/null +++ b/profiler/prof_common_review/analyze_dict.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+class AnalyzeDict(dict): + def __getstate__(self): + return self.__dict__ + + def __setstate__(self, d): + self.__dict__.update(d) + + def __getattr__(self, key: str): + if key not in self: + return {} + + value = self[key] + if isinstance(value, dict): + value = AnalyzeDict(value) + return value diff --git a/profiler/prof_common_review/constant.py b/profiler/prof_common_review/constant.py new file mode 100644 index 0000000000000000000000000000000000000000..5789b89cb1a248977b64839339395acc5288b2ab --- /dev/null +++ b/profiler/prof_common_review/constant.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +class Constant(object): + COLLECTION_PATH = "collection_path" + ANALYSIS_MODE = "analysis_mode" + CONTEXT_SETTINGS = dict(help_option_names=['-H', '-h', '--help']) \ No newline at end of file