diff --git a/profiler/msprof_analyze/precheck/distributed_cluster/distributed_cluster_base.py b/profiler/msprof_analyze/precheck/distributed_cluster/distributed_cluster_base.py
index 4e73dd3f4454967aec2a8275e71fa417a5a13e48..8fd329b1b436e74f35ac44b98c6a09af6cc7fcb6 100644
--- a/profiler/msprof_analyze/precheck/distributed_cluster/distributed_cluster_base.py
+++ b/profiler/msprof_analyze/precheck/distributed_cluster/distributed_cluster_base.py
@@ -396,7 +396,7 @@ class DistributedClusterBase:
             "node_rank": self.node_rank,
             "master_addr": self.master_addr,
             "master_port": self.master_port,
-            "master_rank_num": self.master_rank_num,
+            "master_rank_num": self.local_world_size,
             "split_file_size": split_file_size,
             "time_out": time_out,
             "log_file": log_file
diff --git a/profiler/msprof_analyze/precheck/env_check/environment_variable_check.py b/profiler/msprof_analyze/precheck/env_check/environment_variable_check.py
index 549eb9ddf075c47284807779badbeb85cd1eca18..0fadfe29457ea1507f8f68b593d8142b7be35291 100644
--- a/profiler/msprof_analyze/precheck/env_check/environment_variable_check.py
+++ b/profiler/msprof_analyze/precheck/env_check/environment_variable_check.py
@@ -12,7 +12,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import glob
+import os
+import logging
+import time
+
+import yaml
+
+from msprof_analyze.precheck.distributed_cluster.distributed_cluster_base import DistributedClusterBase
 from msprof_analyze.precheck.env_check.environment_check import SoftwareCheck
+from msprof_analyze.precheck.env_check.utils.constant import Constant
+from msprof_analyze.precheck.env_check.utils.file import File, FileOpen, FdOpen
 
 
 class EnvironmentVariableCheck(SoftwareCheck):
@@ -20,6 +30,91 @@ class EnvironmentVariableCheck(SoftwareCheck):
 
     def __init__(self, args):
         super().__init__(args)
+        self.output_path = args.output
+        self.nproc_per_node = args.nproc_per_node
+        self.world_size = args.world_size
+        self.nnodes = args.world_size // self.nproc_per_node
+        self.rank = args.rank
+        self.node_rank = args.node_rank
+        self.local_rank = args.local_rank
+        self.no_shared_storage = args.no_shared_storage
+        if self.local_rank == 0:
+            self.data_file_path = os.path.join(self.output_path, f"data/{self.CHECK_TYPE}_{self.node_rank}.txt")
+            File.create_file(self.data_file_path)
+        self.logger = logging.getLogger(f"{self.CHECK_TYPE}_{self.node_rank}")
+
 
     def check(self):
-        pass
+        """Compare this node's environment variables with the target values; only local rank 0 runs it."""
+        if self.local_rank == 0:
+            target_variables = self.__get_target_variables()
+            env_variables = self.__get_env_variables()
+            mismatched = self.__check_env_variables(target_variables, env_variables)
+            self.__log_result(mismatched)
+
+    def __get_target_variables(self):
+        """Load the target environment variables from the bundled YAML file; return {} if it cannot be read."""
+        current_directory = os.path.dirname(os.path.abspath(__file__))
+        variables_file_path = os.path.join(current_directory, Constant.TARGET_VARIABLE_FILE_PATH)
+        res = {}
+        with FileOpen(variables_file_path) as file:
+            if not file or not file.file_reader:
+                return {}
+            try:
+                res = yaml.safe_load(file.file_reader) or {}
+            except yaml.YAMLError as e:
+                self.logger.error(f"Error when loading target variables: {e}")
+        return res
+
+    def __get_env_variables(self):
+        """Return a snapshot of the current process environment as a dict."""
+        res = dict(os.environ)
+        return res
+
+    def __check_env_variables(self, target_variables, env_variables):
+        """Return one message per target variable whose environment value differs from the target."""
+        mismatched = []
+
+        for target_key, target_value in target_variables.items():
+            # YAML parses unquoted targets (e.g. 1200) as int, while environment values are always str;
+            # compare string forms so that a correctly set variable is not reported as mismatched.
+            expected_value = "" if target_value is None else str(target_value)
+            env_value = env_variables.get(target_key, "")
+            if env_value != expected_value:
+                mismatched.append(f"Environment variables '{target_key}' value mismatch, "
+                                  f"expected value: '{expected_value}', got value: '{env_value}'.")
+
+        return mismatched
+
+    def __log_result(self, mismatched):
+        """Write this node's result to its data file; global rank 0 then waits for and logs every node's file."""
+        with FdOpen(self.data_file_path) as file:
+            file.write(f"NODE_RANK {self.node_rank} checking environment variables result:\n")
+            if mismatched:
+                file.write(f"Missing or different environment variables:\n")
+                for item in mismatched:
+                    file.write(item + "\n")
+            else:
+                file.write(f"All required environment variables are present and have the correct values.\n")
+        self.logger.info(f"NODE_RANK {self.node_rank} finished checking environment variables.")
+
+        # collect and print global info
+        env = DistributedClusterBase()
+        if self.no_shared_storage:
+            env.collect_global_info(self.data_file_path, self.data_file_path)
+
+        if self.rank == 0:
+            file_path_list = glob.glob(os.path.join(self.output_path, f"data/{self.CHECK_TYPE}_*.txt"))
+            while len(file_path_list) < self.nnodes:
+                self.logger.info(f"Waiting for other nodes to finish checking environment variables.")
+                file_path_list = glob.glob(os.path.join(self.output_path, f"data/{self.CHECK_TYPE}_*.txt"))
+                time.sleep(10)
+
+            for file_path in file_path_list:
+                with FileOpen(file_path, "r") as file:
+                    self.logger.warning(file.file_reader.read())
+
+
+if __name__ == "__main__":
+    checker = EnvironmentVariableCheck()
+    checker.check()
diff --git a/profiler/msprof_analyze/precheck/env_check/python_library_check.py b/profiler/msprof_analyze/precheck/env_check/python_library_check.py
index a23b674ac05fd56cab7ab285f149f1d6cc5e1821..b17ceb1417761f24ff9861f4873ec55f5bed9a9b 100644
--- a/profiler/msprof_analyze/precheck/env_check/python_library_check.py
+++ b/profiler/msprof_analyze/precheck/env_check/python_library_check.py
@@ -12,7 +12,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import glob
+import os
+import logging
+import importlib.metadata
+import time
+import pkg_resources
+
+from msprof_analyze.precheck.distributed_cluster.distributed_cluster_base import DistributedClusterBase
 from msprof_analyze.precheck.env_check.environment_check import SoftwareCheck
+from msprof_analyze.precheck.env_check.utils.constant import Constant
+from msprof_analyze.precheck.env_check.utils.file import File, FileOpen, FdOpen
 
 
 class PythonLibraryCheck(SoftwareCheck):
@@ -20,6 +30,102 @@ class PythonLibraryCheck(SoftwareCheck):
 
     def __init__(self, args):
         super().__init__(args)
+        self.output_path = args.output
+        self.nproc_per_node = args.nproc_per_node
+        self.world_size = args.world_size
+        self.nnodes = args.world_size // self.nproc_per_node
+        self.rank = args.rank
+        self.node_rank = args.node_rank
+        self.local_rank = args.local_rank
+        self.no_shared_storage = args.no_shared_storage
+
+        if self.local_rank == 0:
+            self.data_file_path = os.path.join(self.output_path, f"data/{self.CHECK_TYPE}_{self.node_rank}.txt")
+            File.create_file(self.data_file_path)
+        self.logger = logging.getLogger(f"{self.CHECK_TYPE}_{self.rank}")
 
     def check(self):
-        pass
+        """Compare the installed Python libraries with the target requirements; only local rank 0 runs it."""
+        if self.local_rank == 0:
+            target_libraries = self.__get_target_libraries()
+            env_libraries = self.__get_env_libraries()
+            missing_or_different = self.__check_env_libraries(target_libraries, env_libraries)
+            self.__log_result(missing_or_different)
+
+    def __get_target_libraries(self):
+        """Parse the bundled requirements file into {normalized name: version specifier}."""
+        current_directory = os.path.dirname(os.path.abspath(__file__))
+        requirements_file_path = os.path.join(current_directory, Constant.TARGET_LIBRARY_FILE_PATH)
+        target_libs = {}
+        with FileOpen(requirements_file_path) as file:
+            if not file or not file.file_reader:
+                return {}
+            for line in file.file_reader:
+                line = line.strip()
+                if line and not line.startswith('#'):
+                    try:
+                        lib = pkg_resources.Requirement.parse(line)
+                        # Compare names case-insensitively with '-', '_' and '.' treated alike, because
+                        # installed metadata may spell a project differently (e.g. PyYAML vs pyyaml).
+                        target_libs[lib.project_name.lower().replace("_", "-").replace(".", "-")] = lib.specifier
+                    except ValueError:
+                        self.logger.warning(f"Skipping invalid line: {line}")
+        return target_libs
+
+    def __get_env_libraries(self):
+        """Return {normalized name: version} for every installed distribution that reports a name."""
+        env_libs = {}
+
+        for package in importlib.metadata.distributions():
+            name = package.metadata['Name']
+            if name:
+                env_libs[name.lower().replace("_", "-").replace(".", "-")] = package.version
+        return env_libs
+
+
+    def __check_env_libraries(self, target_libraries, env_libraries):
+        """Return one message per target library that is missing or whose version violates its specifier."""
+        missing_or_mismatched = []
+
+        for lib_name, specifier in target_libraries.items():
+            if lib_name in env_libraries:
+                installed_version = env_libraries[lib_name]
+                if specifier and not specifier.contains(installed_version):
+                    missing_or_mismatched.append(f"Third-party libraries '{lib_name}' version mismatch, "
+                                                 f"expected '{specifier}', got '{installed_version}'.")
+            else:
+                missing_or_mismatched.append(f"Third-party libraries '{lib_name}' missing, "
+                                             f"expected version: '{specifier}'.")
+        return missing_or_mismatched
+
+    def __log_result(self, missing_or_mismatched):
+        """Write this node's result to its data file; global rank 0 then waits for and logs every node's file."""
+        with FdOpen(self.data_file_path) as file:
+            file.write(f"NODE_RANK {self.node_rank} checking Python libraries result:\n")
+            if missing_or_mismatched:
+                file.write(f"Missing or version-mismatched Python libraries:\n")
+                for item in missing_or_mismatched:
+                    file.write(item + "\n")
+            else:
+                file.write(f"All required Python libraries are present and have the correct versions.\n")
+        self.logger.info(f"NODE_RANK {self.node_rank} finished checking Python libraries.")
+
+        # collect and print global info
+        env = DistributedClusterBase()
+        if self.no_shared_storage:
+            env.collect_global_info(self.data_file_path, self.data_file_path)
+
+        if self.rank == 0:
+            file_path_list = glob.glob(os.path.join(self.output_path, f"data/{self.CHECK_TYPE}_*.txt"))
+            while len(file_path_list) < self.nnodes:
+                self.logger.info(f"Waiting for other nodes to finish checking Python libraries.")
+                file_path_list = glob.glob(os.path.join(self.output_path, f"data/{self.CHECK_TYPE}_*.txt"))
+                time.sleep(10)
+
+            for file_path in file_path_list:
+                with FileOpen(file_path, "r") as file:
+                    self.logger.warning(file.file_reader.read())
+
+if __name__ == '__main__':
+    python_library_check = PythonLibraryCheck()
+    python_library_check.check()
diff --git a/profiler/msprof_analyze/precheck/env_check/target_config/__init__.py b/profiler/msprof_analyze/precheck/env_check/target_config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b14094e3f9a77a0970342980ed8de1017f58ce19
--- /dev/null
+++ b/profiler/msprof_analyze/precheck/env_check/target_config/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file diff --git a/profiler/msprof_analyze/precheck/env_check/target_config/target_env_variables.yaml b/profiler/msprof_analyze/precheck/env_check/target_config/target_env_variables.yaml new file mode 100644 index 0000000000000000000000000000000000000000..92e26ce7e56c5b7527f9aa4f29a63bfb2ad7bb30 --- /dev/null +++ b/profiler/msprof_analyze/precheck/env_check/target_config/target_env_variables.yaml @@ -0,0 +1,51 @@ +# 可根据场景修改或添加环境变量目标值 + +# CANN +TE_PARALLEL_COMPILER: "" +# IGNORE_INFER_ERROR: +AUTO_USE_UC_MEMORY: "" +ACLNN_CACHE_LIMIT: "" +ENABLE_DYNAMIC_SHAPE_MULTI_STREAM: "" +MAX_RUNTIME_CORE_NUMBER: "" +HCCL_CONNECT_TIMEOUT: 1200 +HCCL_EXEC_TIMEOUT: 1200 +HCCL_ALGO: "" +HCCL_BUFFSIZE: "" +HCCL_INTRA_PCIE_ENABLE: "" +HCCL_INTRA_ROCE_ENABLE: "" +HCCL_RDMA_TC: "" +HCCL_RDMA_SL: "" +HCCL_RDMA_TIMEOUT: "" +HCCL_RDMA_RETRY_CNT: "" +HCCL_RDMA_PCIE_DIRECT_POST_NOSTRICT: "" +HCCL_RDMA_QPS_PER_CONNECTION: "" +HCCL_MULTI_QP_THRESHOLD: "" +HCCL_OP_EXPANSION_MODE: "" +HCCL_DETERMINISTIC: "false" + +# torch_npu +INF_NAN_MODE_ENABLE: "" +COMBINED_ENABLE: 1 +ASCEND_LAUNCH_BLOCKING: 0 +# ACL_OP_COMPILER_CACHE_DIR: +ACL_OP_COMPILER_CACHE_MODE: "enable" +PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" +HCCL_ASYNC_ERROR_HANDLING: "" +HCCL_DESYNC_DEBUG: "" +HCCL_EVENT_TIMEOUT: "" +P2P_HCCL_BUFFSIZE: "" +PYTORCH_NO_NPU_MEMORY_CACHING: "" +OOM_SNAPSHOT_ENABLE: "" +# OOM_SNAPSHOT_PATH: +# RANK_TABLE_FILE: +# TORCH_NPU_DISABLED_WARNING: +TASK_QUEUE_ENABLE: 2 +# ACL_DEVICE_SYNC_TIMEOUT: +MULTI_STREAM_MEMORY_REUSE: 1 +NPU_ASD_ENABLE: 0 +NPU_ASD_UPPER_THRESH: "" +NPU_ASD_SIGMA_THRESH: "" +INF_NAN_MODE_FORCE_DISABLE: "" + +# Other +CUDA_DEVICE_MAX_CONNECTIONS: 1 \ No newline at end of file diff --git a/profiler/msprof_analyze/precheck/env_check/target_config/target_requirements.txt b/profiler/msprof_analyze/precheck/env_check/target_config/target_requirements.txt new file mode 100644 index 
0000000000000000000000000000000000000000..fa8b6a6af382b52183a72ea58b87f27fc28ae437 --- /dev/null +++ b/profiler/msprof_analyze/precheck/env_check/target_config/target_requirements.txt @@ -0,0 +1,52 @@ +# 可根据场景修改或添加三方库目标值 + +# basic +torch==2.1.0 +torch-npu==2.1.0.post8 +torchvision==0.16.0 +apex==0.1+ascend + +# mindspeed-llm +numpy>=1.19.2,<2.0.0 +transformers==4.43.2 +transformers_stream_generator +sympy +decorator +scipy +sentencepiece +einops +datasets>=2.16.0 +pybind11 +accelerate +six +protobuf +peft==0.7.1 +tiktoken +ray==2.10.0 +tensordict==0.1.2 +hydra-core==1.3.2 +codetiming +bitsandbytes-npu-beta==0.45.2 + +# mindspeed +pybind11 +ninja +wheel +numpy +six +regex +decorator +attrs +psutil +pyyaml +protobuf +einops +scipy +sentencepiece +pytest +tokenizers<=0.20.3 +transformers>=4.43.2 +gpytorch +pandas +scikit-learn +SQLAlchemy \ No newline at end of file diff --git a/profiler/msprof_analyze/precheck/env_check/utils/__init__.py b/profiler/msprof_analyze/precheck/env_check/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b14094e3f9a77a0970342980ed8de1017f58ce19 --- /dev/null +++ b/profiler/msprof_analyze/precheck/env_check/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
\ No newline at end of file
diff --git a/profiler/msprof_analyze/precheck/env_check/utils/constant.py b/profiler/msprof_analyze/precheck/env_check/utils/constant.py
new file mode 100644
index 0000000000000000000000000000000000000000..93526e403d9f4543b11d477d3860c2c6c34a4656
--- /dev/null
+++ b/profiler/msprof_analyze/precheck/env_check/utils/constant.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+class Constant(object):
+    # Max path size
+    MAX_PATH_SIZE = 255
+
+    # Input file size limit 64 KB
+    MAX_FILE_BYTES = 64 * 1024
+
+    # Authority (permission mode) of directories and files created by the checks
+    DIR_AUTHORITY = 0o750
+    File_AUTHORITY = 0o640
+
+    # Target config file path
+    TARGET_VARIABLE_FILE_PATH = "./target_config/target_env_variables.yaml"
+    TARGET_LIBRARY_FILE_PATH = './target_config/target_requirements.txt'
diff --git a/profiler/msprof_analyze/precheck/env_check/utils/file.py b/profiler/msprof_analyze/precheck/env_check/utils/file.py
new file mode 100644
index 0000000000000000000000000000000000000000..eeef193512d2f53dd3107d3f1a8eddad3af95d83
--- /dev/null
+++ b/profiler/msprof_analyze/precheck/env_check/utils/file.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import stat
+import logging
+
+from msprof_analyze.precheck.env_check.utils.constant import Constant
+
+logger = logging.getLogger()
+
+
+class File:
+    """
+    Static helpers for safe file handling: path validation and file creation.
+    """
+    @staticmethod
+    def check(path: str, max_size: int = Constant.MAX_FILE_BYTES) -> bool:
+        """
+        Check whether the given file path is valid (non-empty, short enough, existing, small enough, not a link).
+        """
+        if not path:
+            logger.error("The path is empty. Please enter a valid path.")
+            return False
+        if len(path) > Constant.MAX_PATH_SIZE:
+            logger.error(f"The length of file path is large than {Constant.MAX_PATH_SIZE}. Please check the path.")
+            return False
+        if not os.path.exists(path):
+            logger.error(f"The path \"{path}\" does not exist. Please check the path.")
+            return False
+        if os.path.getsize(path) > max_size:
+            logger.error(f"The path \"{path}\" is too large to read. Please check the path.")
+            return False
+        if os.path.islink(path):
+            logger.error(f"The path \"{path}\" is link. Please check the path.")
+            return False
+        return True
+
+    @staticmethod
+    def check_dir_for_create_file(file_dir):
+        """
+        Creating a file requires write and execute permission on its directory; return False without them.
+        """
+        if not os.access(file_dir, os.W_OK | os.X_OK):
+            logger.error(f"The path \"{file_dir}\" does not have permission to create file. ")
+            return False
+        return True
+
+
+    @staticmethod
+    def create_file(file_path):
+        """
+        Create (or truncate) the file at file_path, creating its parent directory first if it is missing.
+        """
+        directory = os.path.dirname(file_path)
+        if directory and not os.path.exists(directory):
+            os.makedirs(directory, mode=Constant.DIR_AUTHORITY, exist_ok=True)
+        try:
+            with open(file_path, 'w'):
+                os.chmod(file_path, Constant.File_AUTHORITY)
+        except Exception as e:
+            logger.error(f"Failed to create file: {e}")
+
+
+class FileOpen:
+    """
+    Context manager that validates a path and opens it for reading; use it in a with statement.
+    """
+    def __init__(self, path: str, mode: str = "r", max_size: int = Constant.MAX_FILE_BYTES):
+        self.path = path
+        self.mode = mode
+        self.max_size = max_size
+        self.file_reader = None
+
+    def __enter__(self):
+        """Open the file and return self; file_reader stays None when the path check fails."""
+        if not self.check(self.path, self.max_size):
+            logger.error(f"Cannot access the file: {self.path}")
+            return self
+        self.file_reader = open(self.path, self.mode)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.file_reader:
+            self.file_reader.close()
+
+    @staticmethod
+    def check(file_path, max_size: int = Constant.MAX_FILE_BYTES):
+        """Return True if file_path passes File.check, is a regular file and is readable."""
+        if not File.check(file_path, max_size):
+            logger.error(f"FileReader check failed: {file_path}")
+            return False
+        if not os.path.isfile(file_path):
+            logger.error(f"The read path \"{file_path}\" is not a file.")
+            return False
+        if not os.access(file_path, os.R_OK):
+            logger.error(f"The path \"{file_path}\" does not have permission to read. ")
+            return False
+        return True
+
+
+class FdOpen:
+    """
+    Context manager that creates or opens a file for writing via os.open/os.fdopen; use it in a with statement.
+    """
+    def __init__(self, path: str, mode: str = "w", permission: int = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP,
+                 flags: int = os.O_WRONLY | os.O_CREAT | os.O_TRUNC, newline: str = None) -> None:
+        self.path = path
+        self.mode = mode
+        self.permission = permission
+        self.flags = flags
+        self.newline = newline
+        self.fd = None
+        self.file_open = None
+
+    def __enter__(self):
+        """Open the descriptor and return the file object, or None when the path check fails."""
+        if not self.check(self.path):
+            logger.error(f"Cannot access the file: {self.path}")
+            return None
+        self.fd = os.open(self.path, self.flags, self.permission)
+        if self.newline is None:
+            self.file_open = os.fdopen(self.fd, self.mode)
+        else:
+            self.file_open = os.fdopen(self.fd, self.mode, newline=self.newline)
+        return self.file_open
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if self.file_open:
+            self.file_open.close()
+        elif self.fd is not None:
+            os.close(self.fd)
+
+    @staticmethod
+    def check(file_path, max_size: int = Constant.MAX_FILE_BYTES):
+        """Return True if file_path may be written, or created when it does not exist yet."""
+        if not os.path.exists(file_path):
+            return File.check_dir_for_create_file(os.path.dirname(file_path))
+        if not File.check(file_path, max_size):
+            logger.error(f"FileReader check failed: {file_path}")
+            return False
+        if not os.path.isfile(file_path):
+            logger.error(f"The write path \"{file_path}\" is not a file.")
+            return False
+        if not os.access(file_path, os.W_OK):
+            logger.error(f"The path \"{file_path}\" does not have permission to write. ")
+            return False
+        return True
diff --git a/profiler/msprof_analyze/precheck/precheck.py b/profiler/msprof_analyze/precheck/precheck.py
index bb32447d45370eafc391190e2fe7c45afce422c6..5c9b2e24f657d62f959beff17ef959e9d0c742ec 100644
--- a/profiler/msprof_analyze/precheck/precheck.py
+++ b/profiler/msprof_analyze/precheck/precheck.py
@@ -52,8 +52,8 @@ class Precheck:
                 f"--pipeline-model-parallel-size {pipeline_model_parallel_size} "
                 f"--context-parallel-size {context_parallel_size} "
                 f"--expert-model-parallel-size {expert_model_parallel_size} "
-                f"--output {output} "
-                f"--check-type {check_type} "
+                f"--output '{output}' "
+                f"--check-type '{check_type}' "
                 + ("--no-shared-storage " if kwargs.get("no_shared_storage", False) else "")
             ]
             try: