diff --git a/profiler/advisor/__init__.py b/profiler/advisor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0428ee03f05fac6a068642ccd7c36d56d219ea81 --- /dev/null +++ b/profiler/advisor/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/profiler/advisor/advisor_backend/__init__.py b/profiler/advisor/advisor_backend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a0e9f748f4b10347a874f60cec1fa9f6e5285a5e --- /dev/null +++ b/profiler/advisor/advisor_backend/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class AdviceBase:
    """
    Common base for advice analyzers.

    Holds the resolved profiling collection path, the accumulated bottleneck
    text and the result dictionary returned to callers. The three hooks below
    are marked with ``abstractmethod``, but the class does not use ``ABCMeta``,
    so the decorator is documentation only and nothing is enforced at
    instantiation time.
    """
    # Keys of ``output_format_data``.
    DATA = "data"
    BOTTLENECK = "bottleneck"
    ADVICE = "advice"

    def __init__(self, collection_path: str):
        """
        :param collection_path: profiling data directory; stored after
            ``os.path.realpath`` resolution.
        """
        self.collection_path = os.path.realpath(collection_path)
        # Historical misspelling: subclasses read and write this exact name,
        # so it stays the storage attribute. Use ``bottleneck`` in new code.
        self.bottelneck = ''
        self.output_format_data = {
            self.DATA: [],
            self.BOTTLENECK: '',
            self.ADVICE: ''
        }

    @property
    def bottleneck(self) -> str:
        """Correctly spelled alias of the legacy ``bottelneck`` attribute."""
        return self.bottelneck

    @bottleneck.setter
    def bottleneck(self, value: str):
        self.bottelneck = value

    @abstractmethod
    def path_check(self):
        """
        check whether input path is valid
        """

    @abstractmethod
    def run(self):
        """
        analyze profiling data and advice
        """

    @abstractmethod
    def output(self):
        """
        output relevant data
        """
+# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/profiler/advisor/advisor_backend/advice_factory/advice_factory.py b/profiler/advisor/advisor_backend/advice_factory/advice_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..b91de2864070752113babdaaeae1f5c46cf348e5 --- /dev/null +++ b/profiler/advisor/advisor_backend/advice_factory/advice_factory.py @@ -0,0 +1,42 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class AdviceFactory:
    """
    Base factory that maps an advice name to the class implementing it.

    Subclasses override ``ADVICE_LIB`` with ``{advice name: advice class}``.
    ``path_check`` and ``produce_advice`` carry the ``abstractmethod`` marker,
    but the class does not use ``ABCMeta``, so the marker is not enforced.
    """
    # Empty default so ``advice_check`` rejects every name with the intended
    # RuntimeError instead of failing with AttributeError when a subclass
    # does not declare its own registry.
    ADVICE_LIB = {}

    def __init__(self, collection_path: str):
        """
        :param collection_path: profiling data directory; stored after
            ``os.path.realpath`` resolution.
        """
        self.collection_path = os.path.realpath(collection_path)

    @abstractmethod
    def path_check(self):
        """
        check whether input path is valid
        """

    @abstractmethod
    def produce_advice(self):
        """
        produce data for input mode and advice
        """

    def advice_check(self, advice: str):
        """
        check whether input advice is valid

        :param advice: advice name to look up in ``ADVICE_LIB``
        :raises RuntimeError: when the name is not registered
        """
        if advice not in self.ADVICE_LIB:
            msg = '[ERROR]Input advice is illegal.'
            raise RuntimeError(msg)
class ClusterAdviceFactory(AdviceFactory):
    """
    Factory for cluster-mode advice.

    ``ADVICE_LIB`` registers the advice classes that can be requested by name
    through ``produce_advice``.
    """
    ADVICE_LIB = {
        Constant.SLOW_RANK: SlowRankAdvice,
        Constant.SLOW_LINK: SlowLinkAdvice,
        Constant.KERNEL: KernelClusterAdvice
    }

    def __init__(self, collection_path: str):
        super().__init__(collection_path)

    def path_check(self):
        """
        Delegate validation of the collection directory to PathManager.
        """
        PathManager.check_input_directory_path(self.collection_path)

    def produce_advice(self, advice: str):
        """
        Run the advice registered under ``advice`` and return its result.

        The name is validated with ``advice_check`` first, so an unregistered
        name raises before any lookup happens.
        """
        self.advice_check(advice)
        advice_cls = self.ADVICE_LIB.get(advice)
        analyzer = advice_cls(self.collection_path)
        return analyzer.run()
diff --git a/profiler/advisor/advisor_backend/cluster_advice/cluster_advice_base.py b/profiler/advisor/advisor_backend/cluster_advice/cluster_advice_base.py new file mode 100644 index 0000000000000000000000000000000000000000..8cd9acab4c43cc5eff89b6e8c3bdd3ab4a72fc4b --- /dev/null +++ b/profiler/advisor/advisor_backend/cluster_advice/cluster_advice_base.py @@ -0,0 +1,66 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class ClusterAdviceBase(AdviceBase):
    """
    Base for cluster-level advice: makes sure the cluster analysis output
    exists (running the cluster analysis backend when it does not) before a
    concrete advice reads it.
    """

    def __init__(self, collection_path: str):
        super().__init__(collection_path)

    @staticmethod
    def compute_max_gap_ratio(data: list, mean: float):
        """
        Return ``(max(data) - min(data)) / mean``, or 0 when ``mean`` is 0.
        """
        if mean == 0:
            return 0
        return (max(data) - min(data)) / mean

    def path_check(self):
        """
        check whether input path is valid

        When the collection directory already has a 'cluster_analysis_output'
        entry the backend run is skipped; otherwise ``cluster_analyze`` is
        invoked.
        """
        if 'cluster_analysis_output' in os.listdir(self.collection_path):
            print("[INFO]Cluster has been analyzed "
                  "because of the existence of cluster analysis output directory.")
            print("[INFO]Skip Cluster analyze backend.")
            return
        print("[INFO] cluster analysis is in the process, please wait...")
        self.cluster_analyze()

    def cluster_analyze(self):
        """
        Run the cluster analysis backend on the collection path.

        :raises ValueError: wrapping any exception raised by the backend
        """
        parameter = {
            Constant.COLLECTION_PATH: self.collection_path
        }
        try:
            Interface(parameter).run()
        except Exception as e:
            raise ValueError(f"Cluster analyze backend failed:{e}") from e

    @abstractmethod
    def run(self):
        """
        analyze profiling data and advice
        """

    @abstractmethod
    def output(self):
        """
        output relevant data
        """


class KernelClusterAdvice(ClusterAdviceBase):
    """
    Aggregate per-rank kernel duration statistics from each rank's kernel
    details CSV.
    """
    COLUMNS_TO_GROUP = ["Name", "Input Shapes", "Input Data Types", "Output Shapes"]
    COLUMNS_TO_CAL = ["Duration(us)"]
    CAL_FUN = ['mean', 'var', 'max', 'min', 'count', 'sum']

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        self.all_kernel_data = pd.DataFrame()

    def run(self):
        """
        Load every rank's kernel details and return the aggregated DataFrame.
        """
        self.load_kernel_details_data()
        return self.calculate_data()

    def load_kernel_details_data(self):
        """
        Read the kernel details CSV of every rank into ``all_kernel_data``.

        Only the grouping and calculation columns are kept, prefixed with a
        'rank id' column.

        :raises RuntimeError: when a CSV lacks one of the required columns
        """
        data_map = PytorchDataPreprocessor(self.collection_path).get_data_map()
        columns_to_keep = self.COLUMNS_TO_GROUP + self.COLUMNS_TO_CAL
        frames = []
        for rank_id, profiling_dir_path in data_map.items():
            kernel_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.KERNEL_DETAILS_CSV)
            # Validate the CSV through PathManager before parsing it.
            PathManager.check_path_readable(kernel_file)
            df_temp = pd.read_csv(kernel_file)
            if any(column not in df_temp.columns for column in columns_to_keep):
                msg = "[ERROR] Kernel details.csv has wrong data columns, terminate analysis."
                raise RuntimeError(msg)
            # Copy so that insert() works on an independent frame rather than
            # on a slice of the parsed CSV.
            df = df_temp[columns_to_keep].copy()
            df.insert(loc=0, column='rank id', value=rank_id)
            frames.append(df)
        # One concat through the public API replaces the repeated private
        # DataFrame._append calls; pd.concat rejects an empty list, so the
        # no-rank case keeps the empty frame.
        self.all_kernel_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    def calculate_data(self):
        """
        Group the loaded rows by rank and kernel signature and apply every
        function in ``CAL_FUN`` to every column in ``COLUMNS_TO_CAL``.

        :return: DataFrame whose aggregated columns are flattened to
            '<column>_<function>' names
        """
        calculate_dict = {column: self.CAL_FUN for column in self.COLUMNS_TO_CAL}
        group_col = ["rank id"] + self.COLUMNS_TO_GROUP
        view_data = self.all_kernel_data.groupby(group_col).agg(calculate_dict).reset_index()
        # Group keys have an empty second level after reset_index; keep their
        # plain name and join the two levels for aggregated columns.
        view_data.columns = [''.join(col) if col[1] == "" else '_'.join(col) for col in view_data.columns]
        return view_data
class SlowLinkAdvice(ClusterAdviceBase):
    """
    Per-rank RDMA/SDMA bandwidth summary built from cluster_communication.json.

    For every rank the transit size and transit time of each link type are
    accumulated over all operations, and bandwidth is reported as
    accumulated size divided by accumulated time.
    """
    RDMA_TIME_MS = "RDMA time(ms)"
    RDMA_SIZE_MB = "RDMA size(mb)"
    SDMA_TIME_MS = "SDMA time(ms)"
    SDMA_SIZE_MB = "SDMA size(mb)"
    RDMA_BANDWIDTH = "RDMA bandwidth(GB/s)"
    SDMA_BANDWIDTH = "SDMA bandwidth(GB/s)"
    COMMUNICATION_BANDWIDTH_INFO = "Communication Bandwidth Info"
    TRANSIT_TIME = "Transit Time(ms)"
    TRANSIT_SIZE = "Transit Size(MB)"
    SDMA = "SDMA"
    RDMA = "RDMA"

    def __init__(self, collection_path: str):
        super().__init__(collection_path)
        # rank -> accumulated counters; entries are created on first access.
        self.rank_bw_dict = defaultdict(lambda: {
            self.RDMA_TIME_MS: 0,
            self.RDMA_SIZE_MB: 0,
            self.SDMA_TIME_MS: 0,
            self.SDMA_SIZE_MB: 0,
        })

    @staticmethod
    def compute_ratio(dividend: float, divisor: float):
        """
        Return ``dividend / divisor`` rounded to 4 digits, or 0 when the
        divisor is (numerically) zero.
        """
        if abs(divisor) < 1e-15:
            return 0
        return round(dividend / divisor, 4)

    def load_communication_json(self):
        """
        Read cluster_communication.json from the cluster analysis output.

        :raises RuntimeError: when the file does not exist
        """
        json_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT, Constant.CLUSTER_COMM_JSON)
        if not os.path.exists(json_path):
            msg = "[ERROR] cluster_communication.json doesn't exist, terminate analysis."
            raise RuntimeError(msg)
        return FileManager.read_json_file(json_path)

    def run(self):
        """
        Check the path, load and process the communication data, and return
        ``output_format_data``.
        """
        self.path_check()
        communication_json = self.load_communication_json()
        self.process(communication_json)
        self.output()
        return self.output_format_data

    def process(self, communication_json: dict):
        """
        Walk the group -> step -> op nesting, accumulate bandwidth per rank
        and produce the bottleneck text for both link types.
        """
        for group_dict in communication_json.values():
            for step_dict in group_dict.values():
                for op_dict in step_dict.values():
                    self.compute_bandwidth(op_dict)
        if self.rank_bw_dict:
            self.produce_bottleneck(self.RDMA_BANDWIDTH)
            self.produce_bottleneck(self.SDMA_BANDWIDTH)

    def compute_bandwidth(self, op_dict: dict):
        """
        Add one operation's transit size/time to every rank's counters and
        refresh the derived bandwidth values.

        :param op_dict: mapping of rank id to that rank's communication info
        :raises ValueError: when a rank id cannot be converted to ``int``
        """
        for rank_id, rank_dict in op_dict.items():
            try:
                rank = int(rank_id)
            except ValueError as e:
                msg = "[ERROR] Cluster_communication.json has invalid structure."
                raise ValueError(msg) from e
            for comm_type, bw_dict in rank_dict.get(self.COMMUNICATION_BANDWIDTH_INFO, {}).items():
                # Size counters take "Transit Size", time counters take
                # "Transit Time"; mixing them up inverts the bandwidth.
                # A missing entry contributes 0 instead of raising on None.
                if comm_type == self.SDMA:
                    self.rank_bw_dict[rank][self.SDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE, 0)
                    self.rank_bw_dict[rank][self.SDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME, 0)
                elif comm_type == self.RDMA:
                    self.rank_bw_dict[rank][self.RDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE, 0)
                    self.rank_bw_dict[rank][self.RDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME, 0)

        for rank_counters in self.rank_bw_dict.values():
            rank_counters[self.RDMA_BANDWIDTH] = self.compute_ratio(
                rank_counters[self.RDMA_SIZE_MB], rank_counters[self.RDMA_TIME_MS])
            rank_counters[self.SDMA_BANDWIDTH] = self.compute_ratio(
                rank_counters[self.SDMA_SIZE_MB], rank_counters[self.SDMA_TIME_MS])

    def produce_bottleneck(self, link_type: str):
        """
        Append average/max/min bandwidth of ``link_type`` across ranks to the
        bottleneck text. Nothing is appended when there is no rank data or
        the rounded average is 0.
        """
        data_list = [rank_dict.get(link_type, 0) for rank_dict in self.rank_bw_dict.values()]
        if not data_list:
            # No rank recorded: avoid dividing by zero below.
            return
        avg_bw = round(sum(data_list) / len(data_list), 3)
        if avg_bw == 0:
            return
        max_bw = max(data_list)
        min_bw = min(data_list)
        self.bottelneck += f'{link_type}: \n' \
                           f'The average is {avg_bw}, ' \
                           f'while the maximum is {round(max_bw, 3)}GB/s and ' \
                           f'the minimum is {round(min_bw, 3)}GB/s. ' \
                           f'the difference is {round(max_bw - min_bw, 3)}GB/s. \n'

    def output(self):
        """
        Publish the per-rank counters and the bottleneck text in
        ``output_format_data``.
        """
        self.output_format_data[self.DATA] = self.rank_bw_dict
        self.output_format_data[self.BOTTLENECK] = self.bottelneck
class SlowRankAdvice(ClusterAdviceBase):
    """
    Compare computing, communication and free time across ranks using
    cluster_step_trace_time.csv and report categories whose spread is large.
    """
    RANK = "rank"
    RATIO_THRESHOLD = 0.05
    BOTTLENECK_LIST = ['Computing', 'Communication', "Free"]

    def __init__(self, collection_path: str):
        super().__init__(collection_path)

    def load_step_time(self):
        """
        Read cluster_step_trace_time.csv as ClusterStepTraceTimeBean rows.

        :raises RuntimeError: when the file does not exist
        """
        csv_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT, Constant.CLUSTER_STEP_TIME_CSV)
        if os.path.exists(csv_path):
            return FileManager.read_csv_file(csv_path, ClusterStepTraceTimeBean)
        msg = "[ERROR] cluster_step_trace_time.csv doesn't exist, terminate analysis."
        raise RuntimeError(msg)

    def run(self):
        """
        Check the path, load and process the step data, and return
        ``output_format_data``.
        """
        self.path_check()
        per_rank = self.process(self.load_step_time())
        self.output(per_rank)
        return self.output_format_data

    def process(self, step_data: list):
        """
        Sum computing, communication and free time per rank index over all
        rows whose type is ``RANK``, then evaluate each category.

        :return: mapping of rank index to its accumulated time list
        """
        per_rank = defaultdict(lambda: [0, 0, 0, 0])
        for bean in step_data:
            if bean.type != self.RANK:
                continue
            per_rank[bean.index][0] += bean.compute
            per_rank[bean.index][1] += bean.communication
            per_rank[bean.index][2] += bean.free
        totals = [sum(times) for times in per_rank.values()]
        if totals:
            mean_total_time = sum(totals) / len(totals)
            for category, _ in enumerate(self.BOTTLENECK_LIST):
                self.produce_bottleneck(per_rank, category, mean_total_time)
        return per_rank

    def produce_bottleneck(self, step_dict: dict, produce_type: int, mean_total_time: float):
        """
        Append a message to the bottleneck text when the max-min gap of the
        selected category, relative to the mean total time, exceeds
        ``RATIO_THRESHOLD``.
        """
        label = self.BOTTLENECK_LIST[produce_type]
        values = [times[produce_type] for times in step_dict.values()]
        max_ratio = self.compute_max_gap_ratio(values, mean_total_time)
        if not max_ratio > self.RATIO_THRESHOLD:
            return
        self.bottelneck += f'{label} has some issues in the cluster, ' \
                           f'because the max difference of {label} time ' \
                           f'has reached {round(max_ratio * mean_total_time / 1000, 3)}ms. \n'

    def output(self, step_dict: dict):
        """
        Publish the per-rank times and the bottleneck text in
        ``output_format_data``.
        """
        self.output_format_data[self.DATA] = step_dict
        self.output_format_data[self.BOTTLENECK] = self.bottelneck
class Constant:
    # Shared string and numeric constants of the advisor backend.

    # Upper bounds on the length of the mode / advice strings accepted as input.
    MAX_INPUT_MODE_LEN = 30
    MAX_INPUT_ADVICE_LEN = 30

    # mode list
    COMPUTE = "compute"
    TIMELINE = "timeline"
    CLUSTER = "cluster"

    # advice list
    SLOW_RANK = "slow rank"
    SLOW_LINK = "slow link"
    KERNEL = "kernel"

    # Parameter key for the profiling collection path.
    COLLECTION_PATH = "collection_path"
    # Directory and file names of the cluster analysis output.
    CLUSTER_ANALYSIS_OUTPUT = "cluster_analysis_output"
    CLUSTER_STEP_TIME_CSV = "cluster_step_trace_time.csv"
    CLUSTER_COMM_JSON = "cluster_communication.json"
class Interface:
    """
    Entry point of the advisor backend: resolves a mode to its advice factory
    and asks that factory for the requested advice.
    """

    def __init__(self, collection_path: str):
        """
        :param collection_path: default profiling data directory
        """
        self.collection_path = os.path.realpath(collection_path)
        self._factory_controller = FactoryController(collection_path)

    def get_data(self, mode: str, advice: str, input_path=None):
        """
        Produce the data of ``advice`` in ``mode``.

        :param mode: analysis mode, a key of ``FactoryController.FACTORY_LIB``
        :param advice: advice name understood by the mode's factory
        :param input_path: optional directory overriding the collection path
        :return: whatever the selected factory's ``produce_advice`` returns
        :raises RuntimeError: when mode or advice is too long, or the mode is
            not registered
        """
        # Length limits are checked first so oversized input is rejected
        # before any lookup.
        if len(mode) > Constant.MAX_INPUT_MODE_LEN:
            msg = '[ERROR]Input Mode is illegal.'
            raise RuntimeError(msg)
        if len(advice) > Constant.MAX_INPUT_ADVICE_LEN:
            msg = '[ERROR]Input advice is illegal.'
            raise RuntimeError(msg)
        factory = self._factory_controller.create_advice_factory(mode, input_path)
        return factory.produce_advice(advice)


class FactoryController:
    """
    Registry of advice factories keyed by mode.
    """
    FACTORY_LIB = {
        Constant.CLUSTER: ClusterAdviceFactory
    }

    def __init__(self, collection_path: str):
        """
        :param collection_path: default profiling data directory
        """
        self.collection_path = os.path.realpath(collection_path)
        self.temp_input_path = None

    def create_advice_factory(self, mode: str, input_path: str):
        """
        Instantiate the factory registered for ``mode``.

        :param mode: key of ``FACTORY_LIB``
        :param input_path: directory to analyze; the controller's collection
            path is used when this is empty
        :raises RuntimeError: when ``mode`` is not registered
        """
        factory_cls = self.FACTORY_LIB.get(mode)
        if factory_cls is None:
            # Without this guard the lookup result None would be called and
            # fail with an unrelated "NoneType is not callable" TypeError.
            msg = '[ERROR]Input Mode is illegal.'
            raise RuntimeError(msg)
        return factory_cls(input_path if input_path else self.collection_path)
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/advisor/advisor_backend/prof_bean_advisor/cluster_step_trace_time_bean.py b/profiler/advisor/advisor_backend/prof_bean_advisor/cluster_step_trace_time_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..6adbe07be53dc13ab51b0ed862dfc7121d588892 --- /dev/null +++ b/profiler/advisor/advisor_backend/prof_bean_advisor/cluster_step_trace_time_bean.py @@ -0,0 +1,76 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class ClusterStepTraceTimeBean:
    """
    Read-only view over one row (a dict keyed by column name) of
    cluster_step_trace_time.csv.
    """
    STEP = "Step"
    TYPE = "Type"
    INDEX = "Index"
    COMPUTING = "Computing"
    COMMUNICATION = "Communication(Not Overlapped)"
    FREE = "Free"

    def __init__(self, data: dict):
        """
        :param data: mapping of column name to the raw cell value
        """
        self._data = data

    @property
    def row(self) -> list:
        """
        Every field except 'Step' converted to ``float``, in the mapping's
        iteration order. ``float`` raises for cells it cannot parse.
        """
        return [float(value) for name, value in self._data.items() if name != self.STEP]

    @property
    def step(self) -> str:
        """Value of the 'Step' column, '' when absent."""
        return self._data.get(self.STEP, '')

    @property
    def type(self) -> str:
        """Value of the 'Type' column, '' when absent."""
        return self._data.get(self.TYPE, '')

    @property
    def index(self) -> int:
        """
        'Index' column as ``int``.

        :raises ValueError: when the cell is missing, None or not an integer
        """
        try:
            return int(self._data.get(self.INDEX))
        except (TypeError, ValueError) as e:
            # int(None) raises TypeError; report it like any other bad cell.
            msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Index'."
            raise ValueError(msg) from e

    @property
    def compute(self) -> float:
        """
        'Computing' column as ``float``.

        :raises ValueError: when the cell is missing, None or not numeric
        """
        return self._float_field(self.COMPUTING, 'Computing')

    @property
    def communication(self) -> float:
        """
        'Communication(Not Overlapped)' column as ``float``.

        :raises ValueError: when the cell is missing, None or not numeric
        """
        return self._float_field(self.COMMUNICATION, 'Communication')

    @property
    def free(self) -> float:
        """
        'Free' column as ``float``.

        :raises ValueError: when the cell is missing, None or not numeric
        """
        return self._float_field(self.FREE, 'Free')

    def _float_field(self, column: str, label: str) -> float:
        """
        Convert the cell of ``column`` to ``float``; ``label`` is the column
        name shown in the error message.
        """
        try:
            return float(self._data.get(column, ''))
        except (TypeError, ValueError) as e:
            # A None cell makes float() raise TypeError rather than
            # ValueError; both mean the cell is unusable.
            msg = f"[ERROR] Cluster step trace time.csv has invalid value in column '{label}'."
            raise ValueError(msg) from e
__init__(self, params): + self.dirs = params.get('dir') self.analyzers = [] self.columns_to_keep = [] - self.setAnalyzers(args) + self.setAnalyzers(params) self.setColumnsToKeep() def run(self): @@ -280,19 +280,19 @@ class DeliverableGenerator: for analyzer in self.analyzers: analyzer.GenerateDeliverable(summary_data, rank_num) - def setAnalyzers(self, args): + def setAnalyzers(self, params): chip_type = self.formProcess.getChipType() # 判断该路径是不是软链接,并修改为绝对路径 - if os.path.islink(args.dir): - print(f"The file: \"{args.dir}\" is link. Please check the path.") + if os.path.islink(params.get('dir')): + print(f"The file: \"{params.get('dir')}\" is link. Please check the path.") return - prof_path = os.path.realpath(args.dir) + prof_path = os.path.realpath(params.get('dir')) PathManager.input_path_common_check(prof_path) - if args.type == "all": - self.analyzers = [TimeToCsvAnalyzer(chip_type, prof_path), StatisticalInfoToHtmlAnalyzer(chip_type, args.top_n, prof_path)] - elif args.type == "html": - self.analyzers = [StatisticalInfoToHtmlAnalyzer(chip_type, args.top_n, prof_path)] - elif args.type == "csv": + if params.get('type') == "all": + self.analyzers = [TimeToCsvAnalyzer(chip_type, prof_path), StatisticalInfoToHtmlAnalyzer(chip_type, params.get("top_n"), prof_path)] + elif params.get('type') == "html": + self.analyzers = [StatisticalInfoToHtmlAnalyzer(chip_type, params.get("top_n"), prof_path)] + elif params.get('type') == "csv": self.analyzers = [TimeToCsvAnalyzer(chip_type, prof_path)] else: warnings.warn("参数错误,请输入 all html csv 这三种类型") # 发出一个警告信息 @@ -313,8 +313,13 @@ def main(): parser.add_argument("--top_n", "-n", default=10, help="how many operators to show", type=int) parser.add_argument("--type", "-t", default='html', help="compare ratio or aicore-time", type=str) args = parser.parse_args() + params = { + "dir": args.dir, + "top_n": args.top_n, + "type": args.type + } - deviverable_gen = DeliverableGenerator(args) + deviverable_gen = 
DeliverableGenerator(params) deviverable_gen.run() if __name__ == "__main__": diff --git a/profiler/cluster_analyse/common_func/constant.py b/profiler/cluster_analyse/common_func/constant.py index 5b4bc148f78b146dd32d2a3339116eef9591a7fc..5ca830edef47ea3026c2c24fa9bf943ed821f006 100644 --- a/profiler/cluster_analyse/common_func/constant.py +++ b/profiler/cluster_analyse/common_func/constant.py @@ -22,6 +22,7 @@ class Constant(object): COMM_JSON = "communication.json" COMM_MATRIX_JSON = "communication_matrix.json" STEP_TIME_CSV = "step_trace_time.csv" + KERNEL_DETAILS_CSV = "kernel_details.csv" # file authority FILE_AUTHORITY = 0o640 diff --git a/profiler/cluster_analyse/common_func/file_manager.py b/profiler/cluster_analyse/common_func/file_manager.py index b518556fc79024823edd96c6dc51eb385de4679b..3853c806f92de1d8da14e32105fcc869789a9a40 100644 --- a/profiler/cluster_analyse/common_func/file_manager.py +++ b/profiler/cluster_analyse/common_func/file_manager.py @@ -109,9 +109,9 @@ class FileManager: suffix = os.path.splitext(file_path) base_name = os.path.join(file_path) if suffix == Constant.CSV_SUFFIX: - limit_size = Constant.MAX_JSON_SIZE - else: limit_size = Constant.MAX_CSV_SIZE + else: + limit_size = Constant.MAX_JSON_SIZE file_size = os.path.getsize(file_path) if file_size > limit_size: raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.")