From 2e16e9056a66ca687cad498f2a87dbc0a9c113dd Mon Sep 17 00:00:00 2001 From: avocadovo Date: Fri, 21 Feb 2025 10:30:58 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90=E9=9C=80=E6=B1=82=E3=80=91=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0CPU=20NPU=E7=8E=AF=E5=A2=83=E4=BF=A1=E6=81=AF=E9=87=87?= =?UTF-8?q?=E9=9B=86=E5=92=8C=E5=88=86=E6=9E=90=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../precheck/env_check/cpu_check.py | 100 +++++++++++++++++- .../precheck/env_check/matmul_shape.yaml | 22 ++++ .../precheck/env_check/npu_check.py | 97 ++++++++++++++++- .../precheck/env_check/time_analyze.py | 46 ++++++++ 4 files changed, 263 insertions(+), 2 deletions(-) create mode 100644 profiler/msprof_analyze/precheck/env_check/matmul_shape.yaml create mode 100644 profiler/msprof_analyze/precheck/env_check/time_analyze.py diff --git a/profiler/msprof_analyze/precheck/env_check/cpu_check.py b/profiler/msprof_analyze/precheck/env_check/cpu_check.py index 82e9e10484..051bb68a75 100644 --- a/profiler/msprof_analyze/precheck/env_check/cpu_check.py +++ b/profiler/msprof_analyze/precheck/env_check/cpu_check.py @@ -12,7 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import argparse
import os
import json
import logging
import time

import yaml

import torch
import torch_npu

# Absolute package import, consistent with the sibling imports below. The bare
# ``from time_analyze import ...`` form only resolves when this directory
# happens to be on ``sys.path``.
from msprof_analyze.precheck.env_check.time_analyze import TimeAnalyze
from msprof_analyze.precheck.env_check.environment_check import HardwareCheck
from msprof_analyze.precheck.env_check.utils.file import File, FileOpen, FdOpen
from msprof_analyze.precheck.distributed_cluster.distributed_cluster_base import DistributedClusterBase


class CPUCheck(HardwareCheck):
    """Time a fixed matmul workload on the host CPU and flag slow outliers.

    Each process runs the workload once per logical CPU reported by
    ``os.cpu_count()``, writes the timings to
    ``<output>/data/cpucheck/cpucheck_<node_rank>.json`` and, on rank 0,
    merges every JSON file in that directory and reports the slowest entry
    through ``TimeAnalyze``.
    """

    # NOTE(review): the patch context elides one unchanged line of the class
    # body between the class header and ``__init__``; keep that line when
    # applying this block to the real module.

    # The shape file is shipped next to this module, so resolve it relative to
    # the module instead of the current working directory.
    _SHAPE_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'matmul_shape.yaml')
    # Number of addbmm calls per measurement.
    _REPEAT = 10
    # Parsed shape file, loaded lazily on first use and then reused.
    _shape = None

    def __init__(self, args):
        """Store the distributed context and the output options read from ``args``.

        Args:
            args: object exposing ``no_shared_storage`` and ``output``; it is
                also forwarded unchanged to ``HardwareCheck.__init__``.
        """
        super().__init__(args)
        self.dist = DistributedClusterBase()
        self.rank = self.dist.rank
        self.node_rank = self.dist.node_rank
        self.no_shared_storage = args.no_shared_storage
        self.output_path = args.output
        self.output = dict()

    def _load_shape(self):
        """Return the matmul shape parameters, reading the YAML file only once."""
        if self._shape is None:
            with FileOpen(self._SHAPE_FILE, 'r', encoding='utf-8') as f:
                self._shape = yaml.safe_load(f.read())
        return self._shape

    def cpu_matmul(self, cpu_id):
        """Run the benchmark workload: ``_REPEAT`` times ``mat_c + mat_a x mat_b``.

        Args:
            cpu_id: index of the logical CPU this measurement is attributed to.
                It is not used to select a device: torch exposes a single
                ``cpu`` device, so the tensors are simply created on it.

        NOTE(review): nothing here pins the work to ``cpu_id``, so every
        measurement exercises the same thread pool. If per-core results are
        the goal, the thread would have to be pinned (for example with
        ``os.sched_setaffinity``) and the shape reduced accordingly.
        """
        shape = self._load_shape()
        batch_size = shape['batch_size']
        seq_len = shape['seq_len']
        hidden_size = shape['hidden_size']
        intermediate_size = shape['intermediate_size']

        for _ in range(self._REPEAT):
            mat_a = torch.randn(batch_size, seq_len, hidden_size)
            mat_b = torch.randn(batch_size, hidden_size, intermediate_size)
            mat_c = torch.randn(seq_len, intermediate_size)
            torch.addbmm(mat_c, mat_a, mat_b)

    def _measure(self):
        """Fill ``self.output`` with one wall-clock timing (ms) per logical CPU."""
        # os.cpu_count() may return None when the count cannot be determined.
        cpu_count = os.cpu_count() or 1
        torch.set_num_threads(cpu_count)

        for cpu_id in range(cpu_count):
            # The workload is synchronous host code, so a host clock is the
            # right instrument; no NPU event or stream is needed.
            start = time.perf_counter()
            self.cpu_matmul(cpu_id)
            elapsed_ms = (time.perf_counter() - start) * 1000.0
            # The key carries the node rank: rank 0 merges the files of all
            # nodes into one dict, and bare CPU indices would overwrite each
            # other across nodes.
            self.output[f"node{self.node_rank}_cpu{cpu_id}"] = elapsed_ms

    def _dump(self):
        """Write ``self.output`` to this node's JSON file and return the data directory."""
        data_path = os.path.join(self.output_path, "data/cpucheck")
        json_path = os.path.join(data_path, f"cpucheck_{self.node_rank}.json")
        os.makedirs(data_path, exist_ok=True)
        # NOTE(review): the file name depends only on node_rank, so every
        # process of a node that runs this check writes the same file.
        with FdOpen(json_path, 'w') as json_file:
            json.dump(self.output, json_file, ensure_ascii=False, indent=4)
        return data_path

    @staticmethod
    def _merge(data_path):
        """Merge every ``*.json`` file found in ``data_path`` into one dict."""
        merged = {}
        for filename in os.listdir(data_path):
            if filename.endswith(".json"):
                with FileOpen(os.path.join(data_path, filename), 'r') as f:
                    merged.update(json.load(f))
        return merged

    @staticmethod
    def _report(time_all):
        """Analyze the merged timings and log the verdict to a stream handler."""
        logger = logging.getLogger("cpucheck")
        logger.setLevel(logging.INFO)
        handler = logging.StreamHandler()
        handler.setLevel(logging.INFO)
        handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
        logger.addHandler(handler)
        try:
            slow_rank, slow_time, max_ratio, isproblem = TimeAnalyze(time_all).time_analyze()
            if isproblem:
                logger.info(
                    "The CPU:%s may have performance issues, "
                    "its calculation time is %sms, "
                    "the relative difference from the average calculation time is %s%%. "
                    "It is recommended to check and optimize the CPU performance on the server.",
                    slow_rank, slow_time, round(max_ratio * 100))
            else:
                logger.info("CPUs are working well, no performance issues found.")
        finally:
            # Always detach the handler so repeated calls do not duplicate output.
            logger.removeHandler(handler)

    def check(self):
        """Measure, persist, optionally gather, and (on rank 0) report CPU timings."""
        self._measure()
        data_path = self._dump()

        # Without shared storage the per-node files have to be gathered first.
        if self.no_shared_storage:
            self.dist.collect_global_info(data_path, data_path)

        # NOTE(review): with shared storage there is no visible barrier
        # between the other ranks writing their files and rank 0 reading the
        # directory — confirm that the caller synchronizes.
        if self.rank == 0:
            self._report(self._merge(data_path))


if __name__ == "__main__":
    # CPUCheck requires an ``args`` object; build one with the two attributes
    # this class reads.
    # NOTE(review): HardwareCheck may read further attributes — confirm.
    parser = argparse.ArgumentParser(description="CPU performance pre-check")
    parser.add_argument("--output", required=True)
    parser.add_argument("--no_shared_storage", action="store_true")
    cpu_check = CPUCheck(parser.parse_args())
    cpu_check.check()
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# matmul shape 参数 + +batch_size: 1 +seq_len: 4096 +hidden_size: 8192 +intermediate_size: 3584 \ No newline at end of file diff --git a/profiler/msprof_analyze/precheck/env_check/npu_check.py b/profiler/msprof_analyze/precheck/env_check/npu_check.py index b08ab3f83e..3ef5b5eead 100644 --- a/profiler/msprof_analyze/precheck/env_check/npu_check.py +++ b/profiler/msprof_analyze/precheck/env_check/npu_check.py @@ -12,7 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import argparse
import os
import json
import logging

import yaml

import torch
import torch_npu

# Absolute package import, consistent with the sibling imports below. The bare
# ``from time_analyze import ...`` form only resolves when this directory
# happens to be on ``sys.path``.
from msprof_analyze.precheck.env_check.time_analyze import TimeAnalyze
from msprof_analyze.precheck.env_check.environment_check import HardwareCheck
from msprof_analyze.precheck.env_check.utils.file import File, FileOpen, FdOpen
from msprof_analyze.precheck.distributed_cluster.distributed_cluster_base import DistributedClusterBase


class NPUCheck(HardwareCheck):
    """Time a fixed matmul workload on this rank's NPU and flag slow outliers.

    Each rank runs the workload on ``npu:<local_rank>``, writes its timing to
    ``<output>/data/npucheck/npucheck_<rank>.json`` and, on rank 0, merges
    every JSON file in that directory and reports the slowest entry through
    ``TimeAnalyze``.
    """

    # NOTE(review): the patch context elides one unchanged line of the class
    # body between the class header and ``__init__``; keep that line when
    # applying this block to the real module.

    # The shape file is shipped next to this module, so resolve it relative to
    # the module instead of the current working directory.
    _SHAPE_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'matmul_shape.yaml')
    # Number of addbmm calls per measurement.
    _REPEAT = 10

    def __init__(self, args):
        """Store the distributed context and the output options read from ``args``.

        Args:
            args: object exposing ``no_shared_storage`` and ``output``; it is
                also forwarded unchanged to ``HardwareCheck.__init__``.
        """
        super().__init__(args)
        self.dist = DistributedClusterBase()
        self.rank = self.dist.rank
        self.local_rank = self.dist.local_rank
        self.nproc_per_node = self.dist.local_world_size
        self.no_shared_storage = args.no_shared_storage
        self.output_path = args.output
        self.output = dict()

    def npu_matmul(self, npu_id):
        """Run the benchmark workload: ``_REPEAT`` times ``mat_c + mat_a x mat_b``.

        Args:
            npu_id: index of the NPU device the tensors are created on.
        """
        with FileOpen(self._SHAPE_FILE, 'r', encoding='utf-8') as f:
            shape = yaml.safe_load(f.read())
        batch_size = shape['batch_size']
        seq_len = shape['seq_len']
        hidden_size = shape['hidden_size']
        intermediate_size = shape['intermediate_size']

        device = f'npu:{npu_id}'
        for _ in range(self._REPEAT):
            # Create the operands directly on the device. Generating them on
            # the host and copying them over would put host RNG and
            # host-to-device transfer time inside the measured window.
            mat_a = torch.randn(batch_size, seq_len, hidden_size, device=device)
            mat_b = torch.randn(batch_size, hidden_size, intermediate_size, device=device)
            mat_c = torch.randn(seq_len, intermediate_size, device=device)
            torch.addbmm(mat_c, mat_a, mat_b)

    def _measure(self):
        """Record this rank's elapsed time (ms, from NPU events) in ``self.output``."""
        # NOTE(review): the events and the synchronize below use the current
        # device/stream, while the workload targets npu:<local_rank>
        # explicitly. This presumably relies on the current device already
        # being local_rank — confirm where it is set.
        start_event = torch_npu.npu.Event(enable_timing=True)
        end_event = torch_npu.npu.Event(enable_timing=True)

        start_event.record()
        self.npu_matmul(self.local_rank)
        end_event.record()

        # Wait for the stream to drain so both events have completed.
        torch_npu.npu.current_stream().synchronize()
        self.output[self.rank] = start_event.elapsed_time(end_event)

    def _dump(self):
        """Write ``self.output`` to this rank's JSON file and return the data directory."""
        data_path = os.path.join(self.output_path, "data/npucheck")
        json_path = os.path.join(data_path, f"npucheck_{self.rank}.json")
        os.makedirs(data_path, exist_ok=True)
        with FdOpen(json_path, 'w') as json_file:
            json.dump(self.output, json_file, ensure_ascii=False, indent=4)
        return data_path

    @staticmethod
    def _merge(data_path):
        """Merge every ``*.json`` file found in ``data_path`` into one dict."""
        merged = {}
        for filename in os.listdir(data_path):
            if filename.endswith(".json"):
                with FileOpen(os.path.join(data_path, filename), 'r') as f:
                    merged.update(json.load(f))
        return merged

    @staticmethod
    def _report(time_all):
        """Analyze the merged timings and log the verdict to a stream handler."""
        logger = logging.getLogger("npucheck")
        logger.setLevel(logging.INFO)
        handler = logging.StreamHandler()
        handler.setLevel(logging.INFO)
        handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
        logger.addHandler(handler)
        try:
            slow_rank, slow_time, max_ratio, isproblem = TimeAnalyze(time_all).time_analyze()
            if isproblem:
                logger.info(
                    "The NPU:%s may have performance issues, "
                    "its calculation time is %sms, "
                    "the relative difference from the average calculation time is %s%%. "
                    "It is recommended to check and optimize the NPU performance on the server.",
                    slow_rank, slow_time, round(max_ratio * 100))
            else:
                logger.info("NPUs are working well, no performance issues found.")
        finally:
            # Always detach the handler so repeated calls do not duplicate output.
            logger.removeHandler(handler)

    def check(self):
        """Measure, persist, optionally gather, and (on rank 0) report NPU timings."""
        self._measure()
        data_path = self._dump()

        # Without shared storage the per-rank files have to be gathered first.
        if self.no_shared_storage:
            self.dist.collect_global_info(data_path, data_path)

        # NOTE(review): with shared storage there is no visible barrier
        # between the other ranks writing their files and rank 0 reading the
        # directory — confirm that the caller synchronizes.
        if self.rank == 0:
            self._report(self._merge(data_path))


if __name__ == "__main__":
    # NPUCheck requires an ``args`` object; build one with the two attributes
    # this class reads.
    # NOTE(review): HardwareCheck may read further attributes — confirm.
    parser = argparse.ArgumentParser(description="NPU performance pre-check")
    parser.add_argument("--output", required=True)
    parser.add_argument("--no_shared_storage", action="store_true")
    npu_check = NPUCheck(parser.parse_args())
    npu_check.check()
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class TimeAnalyze:
    """Locate the slowest entry in a mapping of measured times.

    Args:
        time: mapping from an identifier (rank, device id, ...) to its
            measured time.
        ratio_threshold: relative excess of the slowest time over the mean
            above which the slowest entry is reported as a problem.
            Defaults to 0.05 (5 %).
    """

    # Class-level default so the attribute always exists on an instance.
    ratio_threshold = 0.05

    def __init__(self, time, ratio_threshold=0.05):
        self.time = time
        self.ratio_threshold = ratio_threshold

    def time_analyze(self):
        """Compare the slowest entry with the mean of all entries.

        Returns:
            A 4-tuple ``(slow_rank, slow_time, max_ratio, isproblem)``:
            the key of the largest time (the first such key on ties), that
            time, ``(slow_time - mean) / mean``, and whether that ratio
            exceeds ``ratio_threshold``.

        Raises:
            ValueError: if the mapping is empty or ``None``. Callers unpack
                the result into four names, so an error string cannot be
                returned in its place.
        """
        if not self.time:
            raise ValueError("TimeAnalyze received no timing data.")

        # Slowest entry: its key and its value.
        slow_rank = max(self.time, key=self.time.get)
        slow_time = self.time[slow_rank]

        # Relative gap between the slowest entry and the mean. A zero mean
        # has no meaningful relative gap, so report 0 rather than divide by 0.
        mean_time = sum(self.time.values()) / len(self.time)
        max_ratio = (slow_time - mean_time) / mean_time if mean_time else 0.0

        isproblem = max_ratio > self.ratio_threshold
        return slow_rank, slow_time, max_ratio, isproblem