From 48850a437000f3f38c3d184feee3e496ec7b7bc2 Mon Sep 17 00:00:00 2001 From: fanglanyue Date: Mon, 23 Dec 2024 16:09:02 +0800 Subject: [PATCH] cluster analyze codecheck --- profiler/affinity_cpu_bind/bind_core.py | 49 +++++++++++++------ profiler/cli/analyze_cli.py | 17 ++++++- profiler/cli/compare_cli.py | 2 +- profiler/cli/complete_cli.py | 14 ++++++ .../cluster_analyse/analysis/base_analysis.py | 1 + .../analysis/comm_matrix_analysis.py | 3 +- .../analysis/communication_analysis.py | 8 +-- .../analysis/host_info_analysis.py | 5 +- .../analysis/step_trace_time_analysis.py | 16 +++--- profiler/cluster_analyse/cluster_analysis.py | 2 +- .../cluster_prof_Info_analysis.py | 9 ++-- .../cluster_utils/data_transfer_adapter.py | 23 +++++++-- .../cluster_analyse/common_func/db_manager.py | 32 ++++++------ .../common_func/empty_class.py | 14 ++++++ .../common_func/table_constant.py | 14 ++++++ .../common_func/tables_config.py | 14 ++++++ .../base_communication_group.py | 4 +- .../communication_db_group.py | 7 +-- .../communication_group_generator.py | 2 +- .../communication_json_group.py | 2 +- .../prof_bean/step_trace_time_bean.py | 1 + .../example/mstx_torch_plugin/__init__.py | 14 ++++++ .../mstx_torch_plugin/mstx_torch_plugin.py | 16 +++++- profiler/example/setup.py | 14 ++++++ profiler/prof_common/__init__.py | 14 ++++++ profiler/prof_common/file_manager.py | 4 +- profiler/prof_common/utils.py | 14 ++++++ profiler/setup.py | 16 ++++++ 28 files changed, 264 insertions(+), 67 deletions(-) diff --git a/profiler/affinity_cpu_bind/bind_core.py b/profiler/affinity_cpu_bind/bind_core.py index 23d72c8b1..e7799b2f3 100644 --- a/profiler/affinity_cpu_bind/bind_core.py +++ b/profiler/affinity_cpu_bind/bind_core.py @@ -1,3 +1,18 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import subprocess import argparse import os @@ -48,7 +63,7 @@ class BindCoreManager(): self.args_parse() if not bind_core_manager.get_npu_info(): - print('[ERROR] Failed to get current npus info') + logging.error('Failed to get current npus info') exit() if not bind_core_manager.get_running_pid_on_npu(): exit() @@ -56,7 +71,6 @@ class BindCoreManager(): bind_core_manager.run_bind_core() def get_running_pid_on_npu(self) -> bool: - no_running_pids_on_npu_msg = '[INFO] Now there is no running process on all NPUs, stop bind cores' logging.info('Begin to find running process on all NPUs') # get running process on NPUs for _ in range(self.find_running_pid_times): @@ -102,7 +116,7 @@ class BindCoreManager(): logging.info('Succeed to find running process %s on NPU %d', pids, npu_id) if_running_process = True if not if_running_process: - print(no_running_pids_on_npu_msg) + logging.info('Now there is no running process on all NPUs, stop bind cores') return if_running_process def get_npu_info(self) -> bool: @@ -129,14 +143,17 @@ class BindCoreManager(): p = subprocess.run(set_affinity_cpu_cmd.split(), shell=False, capture_output=True) logging.info(p.stdout.decode('utf-8')) except subprocess.CalledProcessError: - print('[ERROR] Failed to bind process {} on NPU {} with cpu cores list {}'.format(pid, npu, affinity_cpu)) + logging.error('Failed to bind process {} on NPU {} with cpu cores list {}'.format(pid, npu, + affinity_cpu)) logging.info('Succeed to bind process %s on NPU %d with cpu cores list %s', pid, npu, affinity_cpu) def args_parse(self): parser = argparse.ArgumentParser(description='This is a affinity cpu core bind script.') - parser.add_argument('-t', '--time', type=int, metavar='', help='Wait time before bind cores that you want to set. The unit is \'s\'.') - parser.add_argument('-app', '--application', metavar='', nargs='+', help='Training or inference command that you want to run.') + parser.add_argument('-t', '--time', type=int, metavar='', + help='Wait time before bind cores that you want to set. The unit is \'s\'.') + parser.add_argument('-app', '--application', metavar='', nargs='+', + help='Training or inference command that you want to run.') args = parser.parse_args() if args.application: application_cmd = ' '.join(args.application) @@ -148,7 +165,7 @@ class BindCoreManager(): args.time = 0 msg = f"Invalid parameter. The value of --time is not within the range " \ f"[0, {BindCoreManager.MAX_WAIT_TIME_BEFORE_BIND_CORE}]. --time has been set to 0 to continue." - print(f'[WARNING] {msg}') + logging.warning(f'{msg}') time.sleep(args.time) def _init_log_file(self): @@ -175,7 +192,8 @@ class BindCoreManager(): get_npu_info_cmd = 'npu-smi info -l' get_npu_info_process = subprocess.run(get_npu_info_cmd.split(), shell=False, capture_output=True) get_npu_id_cmd = 'grep ID' - get_npu_id_process = subprocess.run(get_npu_id_cmd.split(), shell=False, input=get_npu_info_process.stdout, capture_output=True) + get_npu_id_process = subprocess.run(get_npu_id_cmd.split(), shell=False, input=get_npu_info_process.stdout, + capture_output=True) res = get_npu_id_process.stdout.decode('utf-8').split() for i in res: if i.isdigit(): @@ -189,7 +207,8 @@ class BindCoreManager(): p = subprocess.run(get_npu_topo_cmd.split(), shell=False, capture_output=True) res = p.stdout.decode('utf-8').split() if not res: - print('[ERROR] Failed to run get npu affinity info, please check if driver version support cmd npu-smi info -t topo') + logging.error('Failed to run get npu affinity info, ' + 'please check if driver version support cmd npu-smi info -t topo') return False index = 0 @@ -205,10 +224,12 @@ class BindCoreManager(): cpus[1] = str(int(cpus[1]) + cpu_num_for_each_npu) affinity_cpus.append(cpus[0] + '-' + cpus[1]) if index < len(self.npu_id_list): - self.npu_affinity_cpu_dict[self.npu_id_list[index]] = ','.join(affinity_cpu for affinity_cpu in affinity_cpus) + self.npu_affinity_cpu_dict[self.npu_id_list[index]] = ','.join( + affinity_cpu for affinity_cpu in affinity_cpus) index += 1 else: - print('[ERROR] Get affinity_cpu_list for {} npus, more than real npu num: {}'.format(index + 1, len(self.npu_id_list))) + logging.error('Get affinity_cpu_list for {} npus, ' + 'more than real npu num: {}'.format(index + 1, len(self.npu_id_list))) return False for k in self.npu_affinity_cpu_dict.keys(): @@ -217,12 +238,12 @@ class BindCoreManager(): if __name__ == '__main__': - print('[INFO] Begin to run bind-cores script...') + logging.info('Begin to run bind-cores script...') bind_core_manager = BindCoreManager() try: bind_core_manager.run() except Exception as exception: - print(f'[ERROR] {exception}') + logging.error(f'{exception}') - print('[INFO] End to run bind-cores script, the log is saved in {}'.format(bind_core_manager.log_file)) + logging.info('End to run bind-cores script, the log is saved in {}'.format(bind_core_manager.log_file)) diff --git a/profiler/cli/analyze_cli.py b/profiler/cli/analyze_cli.py index 0241a27aa..5b2e85e16 100644 --- a/profiler/cli/analyze_cli.py +++ b/profiler/cli/analyze_cli.py @@ -1,7 +1,22 @@ -import click +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import os import logging +import click sys.path.append(os.path.join(os.path.dirname(os.path.dirname(__file__)), "compare_tools")) sys.path.append(os.path.join(os.path.dirname(os.path.dirname(__file__)), "cluster_analyse")) diff --git a/profiler/cli/compare_cli.py b/profiler/cli/compare_cli.py index 1551ee651..e954bd150 100644 --- a/profiler/cli/compare_cli.py +++ b/profiler/cli/compare_cli.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. import ast -import click import os import sys +import click sys.path.append(os.path.dirname(os.path.dirname(__file__))) diff --git a/profiler/cli/complete_cli.py b/profiler/cli/complete_cli.py index 9c4efd0af..78aea6cf8 100644 --- a/profiler/cli/complete_cli.py +++ b/profiler/cli/complete_cli.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import click from profiler.advisor.utils.tools import CONTEXT_SETTINGS diff --git a/profiler/cluster_analyse/analysis/base_analysis.py b/profiler/cluster_analyse/analysis/base_analysis.py index 398098117..a20a5e6da 100644 --- a/profiler/cluster_analyse/analysis/base_analysis.py +++ b/profiler/cluster_analyse/analysis/base_analysis.py @@ -22,6 +22,7 @@ from cluster_utils.data_transfer_adapter import DataTransferAdapter logger = logging.getLogger() + class BaseAnalysis: MAX_RANKS = 1000 diff --git a/profiler/cluster_analyse/analysis/comm_matrix_analysis.py b/profiler/cluster_analyse/analysis/comm_matrix_analysis.py index 3df349ee8..522d9db9f 100644 --- a/profiler/cluster_analyse/analysis/comm_matrix_analysis.py +++ b/profiler/cluster_analyse/analysis/comm_matrix_analysis.py @@ -18,10 +18,11 @@ import logging from collections import defaultdict from analysis.base_analysis import BaseAnalysis -from profiler.prof_common.constant import Constant from common_func.db_manager import DBManager from common_func.utils import increase_shared_value +from profiler.prof_common.constant import Constant + logger = logging.getLogger("cluster") diff --git a/profiler/cluster_analyse/analysis/communication_analysis.py b/profiler/cluster_analyse/analysis/communication_analysis.py index 0cac56951..8dece4339 100644 --- a/profiler/cluster_analyse/analysis/communication_analysis.py +++ b/profiler/cluster_analyse/analysis/communication_analysis.py @@ -245,14 +245,14 @@ class CommunicationAnalysisOptimized(BaseAnalysis): return total_bw_info, total_transit_size, total_transit_time for _, step_dict in self._aggregate_bandwidth.items(): - for step_id, rank_dict in step_dict.items(): - for rank_id, communication_op_info in rank_dict.items(): - for transport_type, bandwidth_info in communication_op_info.items(): + for _, rank_dict in step_dict.items(): + for _, communication_op_info in rank_dict.items(): + for _, bandwidth_info in communication_op_info.items(): total_transit_size = 0.0 total_transit_time = 0.0 total_info = [] op_group_set = set() - for package_size, package_info in bandwidth_info.items(): + for _, package_info in bandwidth_info.items(): total_bandwidth_info, total_transit_size, total_transit_time = process_package_info( package_info, total_transit_size, total_transit_time, op_group_set ) diff --git a/profiler/cluster_analyse/analysis/host_info_analysis.py b/profiler/cluster_analyse/analysis/host_info_analysis.py index 4d1c8889b..3a6cfb8a5 100644 --- a/profiler/cluster_analyse/analysis/host_info_analysis.py +++ b/profiler/cluster_analyse/analysis/host_info_analysis.py @@ -17,10 +17,11 @@ import os import logging from analysis.base_analysis import BaseAnalysis -from profiler.prof_common.constant import Constant from common_func.db_manager import DBManager from common_func.utils import increase_shared_value +from profiler.prof_common.constant import Constant + logger = logging.getLogger("cluster") @@ -101,4 +102,4 @@ class HostInfoAnalysis(BaseAnalysis): self.all_rank_host_info[host_uid] = host_name self.all_rank_device_info.extend(rank_device_info) if print_empty_host_info: - print(print_empty_host_info) + logger.warning(print_empty_host_info) diff --git a/profiler/cluster_analyse/analysis/step_trace_time_analysis.py b/profiler/cluster_analyse/analysis/step_trace_time_analysis.py index db0b9b5b0..82cae80b2 100644 --- a/profiler/cluster_analyse/analysis/step_trace_time_analysis.py +++ b/profiler/cluster_analyse/analysis/step_trace_time_analysis.py @@ -18,10 +18,11 @@ import logging from common_func.db_manager import DBManager from common_func.utils import increase_shared_value +from cluster_utils.parallel_strategy_calculator import ParallelStrategyCalculator +from prof_bean.step_trace_time_bean import StepTraceTimeBean + from profiler.prof_common.constant import Constant from profiler.prof_common.file_manager import FileManager -from prof_bean.step_trace_time_bean import StepTraceTimeBean -from cluster_utils.parallel_strategy_calculator import ParallelStrategyCalculator logger = logging.getLogger("cluster") @@ -77,18 +78,19 @@ class StepTraceTimeAnalysis: if len(parallelism_map) > len(self.step_time_dict): missing_rank_ids = [ - rank_id for rank_id in range(len(parallelism_map)) + rank_id + for rank_id in range(len(parallelism_map)) if rank_id not in self.step_time_dict ] logger.warning("Step trace data length should equal to real rank numbers, but get step data length =" "%s, real rank numbers = %s, maybe lost some rank ids = %s, please check your profiling " - "data.",str(len(self.step_time_dict)),str(len(parallelism_map)),str(missing_rank_ids)) + "data.", (len(self.step_time_dict)), str(len(parallelism_map)), str(missing_rank_ids)) if len(parallelism_map) < len(self.step_time_dict): logger.error("Step trace data length should equal to real rank numbers, but get step data length = %s," " real rank numbers = %s, maybe parallel params in profiler_metadata.json is error, " "please check your metadata data.", - str(len(self.step_time_dict)),str(len(parallelism_map))) + str(len(self.step_time_dict)), str(len(parallelism_map))) self.distributed_args = None return @@ -146,8 +148,8 @@ class StepTraceTimeAnalysis: self.step_time_dict[rank_id] = data DBManager.destroy_db_connect(conn, cursor) if not self.step_time_dict.get(rank_id): - logger.warning("Rank %s does not have a valid step_trace_time data in %s file." - ,str(rank_id),str(self.data_type)) + logger.warning("Rank %s does not have a valid step_trace_time data in %s file.", + str(rank_id), str(self.data_type)) def analyze_step_time(self): for rank_id, data_bean_list in self.step_time_dict.items(): diff --git a/profiler/cluster_analyse/cluster_analysis.py b/profiler/cluster_analyse/cluster_analysis.py index 6ab68fe17..42d5be474 100644 --- a/profiler/cluster_analyse/cluster_analysis.py +++ b/profiler/cluster_analyse/cluster_analysis.py @@ -19,10 +19,10 @@ import logging import sys sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) +from analysis.analysis_facade import AnalysisFacade from cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor from cluster_data_preprocess.mindspore_data_preprocessor import MindsporeDataPreprocessor from communication_group.communication_group_generator import CommunicationGroupGenerator -from analysis.analysis_facade import AnalysisFacade from profiler.prof_common.additional_args_manager import AdditionalArgsManager from profiler.prof_common.constant import Constant from profiler.prof_common.file_manager import FileManager diff --git a/profiler/cluster_analyse/cluster_kernels_analysis/cluster_prof_Info_analysis.py b/profiler/cluster_analyse/cluster_kernels_analysis/cluster_prof_Info_analysis.py index 3af835e6d..64c1d27b0 100644 --- a/profiler/cluster_analyse/cluster_kernels_analysis/cluster_prof_Info_analysis.py +++ b/profiler/cluster_analyse/cluster_kernels_analysis/cluster_prof_Info_analysis.py @@ -51,7 +51,8 @@ class FormDataProcessor: node_id = re.search(r'node(\d+)', dir_path).group(1) return int(node_id) - def get_files_with_prefix_recursive(self, csv_path, match_str): + @staticmethod + def get_files_with_prefix_recursive(csv_path, match_str): matched_ir_files = list(Path(csv_path).rglob(match_str)) if not matched_ir_files: msg = f"Didn't find any file in folder {csv_path} that matches {match_str}" @@ -186,7 +187,7 @@ class OpSummaryAnalyzerBase: def get_columns_to_view(self): return self.columns_to_view - def calculateViewData(self, summary_data): + def calculate_view_data(self, summary_data): # 存储所有合并后的数据 calculate_dict = {self.columns_to_view[i]: self.calculate_fun for i in range(len(self.columns_to_view))} view_data = summary_data.groupby(self.attrs_to_group).agg(calculate_dict).reset_index() @@ -198,7 +199,7 @@ class TimeToCsvAnalyzer(OpSummaryAnalyzerBase): super().__init__(chip_type, "TimeToCsvAnalyzer", dir_path) def generate_deliverable(self, summary_data, rank_num): - view_data = self.calculateViewData(summary_data) + view_data = self.calculate_view_data(summary_data) # 规范化列名 view_data.columns = [''.join(col) if col[1] == "" else '_'.join(col) for col in view_data.columns] try: @@ -221,7 +222,7 @@ class StatisticalInfoToHtmlAnalyzer(OpSummaryAnalyzerBase): # top_n 如果不符合要求,报警告 def generate_deliverable(self, summary_data, rank_num): - view_data = self.calculateViewData(summary_data) + view_data = self.calculate_view_data(summary_data) # 规范化列名 op_name/ --> op_name time/var 这种不变 view_data.columns = [''.join(col) if col[1] == "" else col for col in view_data.columns] diff --git a/profiler/cluster_analyse/cluster_utils/data_transfer_adapter.py b/profiler/cluster_analyse/cluster_utils/data_transfer_adapter.py index c2aed8250..2de1ca959 100644 --- a/profiler/cluster_analyse/cluster_utils/data_transfer_adapter.py +++ b/profiler/cluster_analyse/cluster_utils/data_transfer_adapter.py @@ -1,7 +1,20 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import copy - -from profiler.prof_common.constant import Constant from common_func.table_constant import TableConstant +from profiler.prof_common.constant import Constant class DataTransferAdapter(object): @@ -85,8 +98,8 @@ class DataTransferAdapter(object): res_dict[TableConstant.GROUP_NAME] = op_name.split("@")[1] if "@" in op_name else "" return res_dict - for rank_set, step_dict in res_data.items(): - for step, op_dict in step_dict.items(): + for _, step_dict in res_data.items(): + for _, op_dict in step_dict.items(): for op_name, op_data in op_dict.items(): split_comm_time() return res_comm_data, res_bd_data @@ -137,7 +150,7 @@ class DataTransferAdapter(object): self.set_value_by_key(matrix_data, link_data, key_dict) result.append(matrix_data) - for rank_set, step_dict in res_data.items(): + for _, step_dict in res_data.items(): for step, op_dict in step_dict.items(): split_matrix_data() return result diff --git a/profiler/cluster_analyse/common_func/db_manager.py b/profiler/cluster_analyse/common_func/db_manager.py index 85bcf975d..1411acd35 100644 --- a/profiler/cluster_analyse/common_func/db_manager.py +++ b/profiler/cluster_analyse/common_func/db_manager.py @@ -15,12 +15,14 @@ import os import sqlite3 +import logging -from profiler.prof_common.constant import Constant from common_func.empty_class import EmptyClass -from profiler.prof_common.file_manager import check_db_path_valid from common_func.tables_config import TablesConfig +from profiler.prof_common.constant import Constant +from profiler.prof_common.file_manager import check_db_path_valid + class DBManager: """ class to manage DB operation @@ -38,7 +40,7 @@ class DBManager: try: conn = sqlite3.connect(db_path) except sqlite3.Error as err: - print(f"[ERROR] {err}") + logging.error(err) return EmptyClass("empty conn"), EmptyClass("empty curs") try: if isinstance(conn, sqlite3.Connection): @@ -46,7 +48,7 @@ class DBManager: os.chmod(db_path, Constant.FILE_AUTHORITY) return conn, curs except sqlite3.Error as err: - print(f"[ERROR] {err}") + logging.error(err) return EmptyClass("empty conn"), EmptyClass("empty curs") return EmptyClass("empty conn"), EmptyClass("empty curs") @@ -59,12 +61,12 @@ class DBManager: if isinstance(curs, sqlite3.Cursor): curs.close() except sqlite3.Error as err: - print(f"[ERROR] {err}") + logging.error(err) try: if isinstance(conn, sqlite3.Connection): conn.close() except sqlite3.Error as err: - print(f"[ERROR] {err}") + logging.error(err) @staticmethod def judge_table_exists(curs: any, table_name: str) -> any: @@ -77,7 +79,7 @@ class DBManager: curs.execute("select count(*) from sqlite_master where type='table' and name=?", (table_name,)) return curs.fetchone()[0] except sqlite3.Error as err: - print("[ERROR] {}".format(err)) + logging.error(err) return False @staticmethod @@ -138,7 +140,7 @@ class DBManager: curs.execute(sql) res = curs.fetchone()[0] except sqlite3.Error as err: - print("[ERROR] {}".format(err)) + logging.error(err) finally: cls.destroy_db_connect(conn, curs) return res @@ -157,9 +159,9 @@ class DBManager: conn.commit() return True except sqlite3.Error as err: - print(f"[ERROR] {err}") + logging.error(err) return False - print("[ERROR] conn is invalid param") + logging.error("conn is invalid param") return False @staticmethod @@ -173,9 +175,9 @@ class DBManager: conn.commit() return True except sqlite3.Error as err: - print(f"[ERROR] {err}") + logging.error(err) return False - print("[ERROR] conn is invalid param") + logging.error("conn is invalid param") return False @classmethod @@ -192,7 +194,7 @@ class DBManager: else: res = curs.execute(sql) except sqlite3.Error as err: - print(f"[ERROR] {err}") + logging.error(err) curs.row_factory = None return [] try: @@ -204,12 +206,12 @@ class DBManager: else: data += res if len(data) > cls.MAX_ROW_COUNT: - print("[WARRING] The records count in the table exceeds the limit!") + logging.warning("The records count in the table exceeds the limit!") if len(res) < cls.FETCH_SIZE: break return data except sqlite3.Error as err: - print(f"[ERROR] {err}") + logging.error(err) return [] finally: curs.row_factory = None diff --git a/profiler/cluster_analyse/common_func/empty_class.py b/profiler/cluster_analyse/common_func/empty_class.py index df100d156..3bc5751ff 100644 --- a/profiler/cluster_analyse/common_func/empty_class.py +++ b/profiler/cluster_analyse/common_func/empty_class.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. class EmptyClass: def __init__(self: any, info: str = "") -> None: diff --git a/profiler/cluster_analyse/common_func/table_constant.py b/profiler/cluster_analyse/common_func/table_constant.py index de6d47e97..8a1355836 100644 --- a/profiler/cluster_analyse/common_func/table_constant.py +++ b/profiler/cluster_analyse/common_func/table_constant.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. class TableConstant: RANK_SET = "rank_set" diff --git a/profiler/cluster_analyse/common_func/tables_config.py b/profiler/cluster_analyse/common_func/tables_config.py index d8f49916a..e9868737d 100644 --- a/profiler/cluster_analyse/common_func/tables_config.py +++ b/profiler/cluster_analyse/common_func/tables_config.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. class TablesConfig: DATA = { "ClusterCommAnalyzerTimeMap": [ diff --git a/profiler/cluster_analyse/communication_group/base_communication_group.py b/profiler/cluster_analyse/communication_group/base_communication_group.py index 2c44d0ae0..59c6b9c3e 100644 --- a/profiler/cluster_analyse/communication_group/base_communication_group.py +++ b/profiler/cluster_analyse/communication_group/base_communication_group.py @@ -20,8 +20,8 @@ from copy import deepcopy from multiprocessing import Pool import logging -from profiler.prof_common.constant import Constant from cluster_utils.data_transfer_adapter import DataTransferAdapter +from profiler.prof_common.constant import Constant logger = logging.getLogger() @@ -80,7 +80,7 @@ class BaseCommunicationGroup: def generate_p2p_communication_group(self): stage_group = {} - for group_name, rank_set in self.collective_group_dict.items(): + for _, rank_set in self.collective_group_dict.items(): if not self.whether_valid_comm_group(rank_set): continue unioned_set = set() diff --git a/profiler/cluster_analyse/communication_group/communication_db_group.py b/profiler/cluster_analyse/communication_group/communication_db_group.py index eef617c9e..8a4d24b97 100644 --- a/profiler/cluster_analyse/communication_group/communication_db_group.py +++ b/profiler/cluster_analyse/communication_group/communication_db_group.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -""" -# Copyright (C) 2024-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (C) 2024-2024. Huawei Technologies Co., Ltd. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,15 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" import os import logging from common_func.db_manager import DBManager -from profiler.prof_common.constant import Constant from communication_group.base_communication_group import BaseCommunicationGroup - +from profiler.prof_common.constant import Constant logger = logging.getLogger() diff --git a/profiler/cluster_analyse/communication_group/communication_group_generator.py b/profiler/cluster_analyse/communication_group/communication_group_generator.py index 324137207..51ddb6a2e 100644 --- a/profiler/cluster_analyse/communication_group/communication_group_generator.py +++ b/profiler/cluster_analyse/communication_group/communication_group_generator.py @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from profiler.prof_common.constant import Constant from communication_group.communication_db_group import CommunicationDBGroup from communication_group.communication_db_group import CommunicationDBGroupOptimized from communication_group.communication_json_group import CommunicationJsonGroup +from profiler.prof_common.constant import Constant SIMPLIFIED = "SIMPLIFIED" diff --git a/profiler/cluster_analyse/communication_group/communication_json_group.py b/profiler/cluster_analyse/communication_group/communication_json_group.py index d9c1c0b40..1cd770f75 100644 --- a/profiler/cluster_analyse/communication_group/communication_json_group.py +++ b/profiler/cluster_analyse/communication_group/communication_json_group.py @@ -15,8 +15,8 @@ import os -from profiler.prof_common.file_manager import FileManager from communication_group.base_communication_group import BaseCommunicationGroup +from profiler.prof_common.file_manager import FileManager class CommunicationJsonGroup(BaseCommunicationGroup): diff --git a/profiler/cluster_analyse/prof_bean/step_trace_time_bean.py b/profiler/cluster_analyse/prof_bean/step_trace_time_bean.py index 3cf4bc4fa..226b1968e 100644 --- a/profiler/cluster_analyse/prof_bean/step_trace_time_bean.py +++ b/profiler/cluster_analyse/prof_bean/step_trace_time_bean.py @@ -18,6 +18,7 @@ import logging logger = logging.getLogger() + class StepTraceTimeBean: STEP = "Step" COMPLEMENT_HEADER = ["Step", "Type", "Index"] diff --git a/profiler/example/mstx_torch_plugin/__init__.py b/profiler/example/mstx_torch_plugin/__init__.py index 0ebc8ae1a..b405266d6 100644 --- a/profiler/example/mstx_torch_plugin/__init__.py +++ b/profiler/example/mstx_torch_plugin/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import sys import logging from .mstx_torch_plugin import apply_mstx_patch diff --git a/profiler/example/mstx_torch_plugin/mstx_torch_plugin.py b/profiler/example/mstx_torch_plugin/mstx_torch_plugin.py index 6f731a407..cd8145fc2 100644 --- a/profiler/example/mstx_torch_plugin/mstx_torch_plugin.py +++ b/profiler/example/mstx_torch_plugin/mstx_torch_plugin.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os import functools import torch @@ -115,7 +129,7 @@ def _step_hook(self, *args, **kwargs): if id(self) != mstx_state.last_optimizer_id: return stream = torch.npu.current_stream() - mstx_state.step_id+=1 + mstx_state.step_id += 1 if mstx_state.step_range_id is not None: torch.npu.mstx.range_end(mstx_state.step_range_id) mstx_state.step_range_id = torch.npu.mstx.range_start(f"step {mstx_state.step_id}", stream) diff --git a/profiler/example/setup.py b/profiler/example/setup.py index faf7f497d..3b150ad16 100644 --- a/profiler/example/setup.py +++ b/profiler/example/setup.py @@ -1,5 +1,19 @@ #!/usr/bin/python # -*- coding: utf-8 -*- +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os from setuptools import setup, find_packages diff --git a/profiler/prof_common/__init__.py b/profiler/prof_common/__init__.py index 9ee8a1b9b..577e0e8e1 100644 --- a/profiler/prof_common/__init__.py +++ b/profiler/prof_common/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os import sys sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) \ No newline at end of file diff --git a/profiler/prof_common/file_manager.py b/profiler/prof_common/file_manager.py index 467926dd9..b0dfd81b7 100644 --- a/profiler/prof_common/file_manager.py +++ b/profiler/prof_common/file_manager.py @@ -39,7 +39,7 @@ class FileManager: f"The file({file_path}) size exceeds the preset max value. Continue reading the file? [y/n]") if check_msg.lower() != "y": logger.warning("The user choose not to read the file: %s", file_path) - return [] + return {} try: with open(file_path, "r") as json_file: result_data = json.loads(json_file.read()) @@ -59,7 +59,7 @@ class FileManager: check_msg = input( f"The file({file_path}) size exceeds the preset max value. Continue reading the file? [y/n]") if check_msg.lower() != "y": - print(f"[WARNING] The user choose not to read the file: {file_path}") + logger.warning(f"The user choose not to read the file: {file_path}") return [] result_data = [] try: diff --git a/profiler/prof_common/utils.py b/profiler/prof_common/utils.py index 9cef03d33..664a64d32 100644 --- a/profiler/prof_common/utils.py +++ b/profiler/prof_common/utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import configparser import logging import os diff --git a/profiler/setup.py b/profiler/setup.py index 7e5516ef2..124c5ed5b 100644 --- a/profiler/setup.py +++ b/profiler/setup.py @@ -1,5 +1,21 @@ #!/usr/bin/python # -*- coding: utf-8 -*- + +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import sys -- Gitee