From 2e16e9056a66ca687cad498f2a87dbc0a9c113dd Mon Sep 17 00:00:00 2001
From: avocadovo <wangjun578@h-partners.com>
Date: Fri, 21 Feb 2025 10:30:58 +0800
Subject: [PATCH] =?UTF-8?q?=E3=80=90=E9=9C=80=E6=B1=82=E3=80=91=E5=AE=9E?=
 =?UTF-8?q?=E7=8E=B0CPU=20NPU=E7=8E=AF=E5=A2=83=E4=BF=A1=E6=81=AF=E9=87=87?=
 =?UTF-8?q?=E9=9B=86=E5=92=8C=E5=88=86=E6=9E=90=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../precheck/env_check/cpu_check.py           | 100 +++++++++++++++++-
 .../precheck/env_check/matmul_shape.yaml      |  22 ++++
 .../precheck/env_check/npu_check.py           |  97 ++++++++++++++++-
 .../precheck/env_check/time_analyze.py        |  46 ++++++++
 4 files changed, 263 insertions(+), 2 deletions(-)
 create mode 100644 profiler/msprof_analyze/precheck/env_check/matmul_shape.yaml
 create mode 100644 profiler/msprof_analyze/precheck/env_check/time_analyze.py

diff --git a/profiler/msprof_analyze/precheck/env_check/cpu_check.py b/profiler/msprof_analyze/precheck/env_check/cpu_check.py
index 82e9e10484..051bb68a75 100644
--- a/profiler/msprof_analyze/precheck/env_check/cpu_check.py
+++ b/profiler/msprof_analyze/precheck/env_check/cpu_check.py
@@ -12,7 +12,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
+import json
+import logging
+
+import yaml
+
+import torch
+import torch_npu
+
+from time_analyze import TimeAnalyze
 from msprof_analyze.precheck.env_check.environment_check import HardwareCheck
+from msprof_analyze.precheck.env_check.utils.file import File, FileOpen, FdOpen
+from msprof_analyze.precheck.distributed_cluster.distributed_cluster_base import DistributedClusterBase
 
 
 class CPUCheck(HardwareCheck):
@@ -20,6 +32,92 @@ class CPUCheck(HardwareCheck):
 
     def __init__(self, args):
+        """Set up cluster rank info and the output settings read from ``args``."""
         super().__init__(args)
+        cluster = DistributedClusterBase()
+        self.dist = cluster
+        self.rank, self.node_rank = cluster.rank, cluster.node_rank
+        self.no_shared_storage, self.output_path = args.no_shared_storage, args.output
+        self.output = {}  # filled by check(): cpu_id -> measured time
+
+    def cpu_matmul(self, cpu_id):
+        """Run 10 rounds of ``mat_c + mat_a x mat_b`` on device ``cpu:{cpu_id}``.
+
+        Shapes come from ``matmul_shape.yaml`` next to this module (not the CWD).
+        """
+        shape_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'matmul_shape.yaml')
+        with FileOpen(shape_file, 'r', encoding='utf-8') as f:
+            shape = yaml.safe_load(f.read())
+        batch, seq, hidden, inter = (
+            shape[key] for key in ('batch_size', 'seq_len', 'hidden_size', 'intermediate_size'))
+        for _ in range(10):
+            mat_a = torch.randn(batch, seq, hidden).to(f'cpu:{cpu_id}')
+            mat_b = torch.randn(batch, hidden, inter).to(f'cpu:{cpu_id}')
+            mat_c = torch.randn(seq, inter).to(f'cpu:{cpu_id}')
+            torch.addbmm(mat_c, mat_a, mat_b)
 
     def check(self):
-        pass
+        """Time the matmul workload once per CPU id, dump the timings and, on rank 0, analyze them.
+
+        Each node writes ``<output>/data/cpucheck/cpucheck_<node_rank>.json``. Rank 0
+        merges every JSON file in that directory and logs whether the slowest
+        entry deviates too much from the mean (see ``TimeAnalyze``).
+        """
+        cpu_ids = os.cpu_count()
+        torch.set_num_threads(cpu_ids)
+        for cpu_id in range(cpu_ids):
+            # NPU timing events are used as the stopwatch around the CPU workload.
+            start_event = torch_npu.npu.Event(enable_timing=True)
+            end_event = torch_npu.npu.Event(enable_timing=True)
+            start_event.record()
+            self.cpu_matmul(cpu_id)
+            end_event.record()
+            # Synchronize the current stream so every operation has finished.
+            torch_npu.npu.current_stream().synchronize()
+            self.output[cpu_id] = start_event.elapsed_time(end_event)
+
+        # Persist this node's measurements.
+        data_path = os.path.join(self.output_path, "data/cpucheck")
+        json_path = os.path.join(data_path, f"cpucheck_{self.node_rank}.json")
+        os.makedirs(os.path.dirname(json_path), exist_ok=True)
+        with FdOpen(json_path, 'w') as json_file:
+            json.dump(self.output, json_file, ensure_ascii=False, indent=4)
+
+        # Without shared storage the per-node files have to be gathered first.
+        if self.no_shared_storage:
+            self.dist.collect_global_info(data_path, data_path)
+
+        if self.rank != 0:
+            return
+
+        # Every node reports the same CPU ids, so tag each key with the node rank taken
+        # from the file name; a plain dict.update() would let nodes overwrite each other.
+        time_all = {}
+        for filename in sorted(os.listdir(data_path)):
+            if not filename.endswith(".json"):
+                continue
+            node = os.path.splitext(filename)[0].rsplit("_", 1)[-1]
+            with FileOpen(os.path.join(data_path, filename), 'r') as f:
+                time_all.update({f"node{node}-cpu{cpu}": cost for cpu, cost in json.load(f).items()})
+
+        logger = logging.getLogger("cpucheck")
+        logger.setLevel(logging.INFO)
+        handler = logging.StreamHandler()
+        handler.setLevel(logging.INFO)
+        handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
+        logger.addHandler(handler)
+        try:
+            slow_rank, slow_time, max_ratio, isproblem = TimeAnalyze(time_all).time_analyze()
+            if isproblem:
+                logger.info(f"The CPU:{slow_rank} may have performance issues, "
+                            f"its calculation time is {slow_time}ms, "
+                            f"the relative difference from the average calculation time is {round(max_ratio * 100)}%. "
+                            f"It is recommended to check and optimize the CPU performance on the server.")
+            else:
+                logger.info("CPUs are working well, no performance issues found.")
+        finally:
+            logger.removeHandler(handler)
+
+
+if __name__ == "__main__":
+    cpu_check = CPUCheck()  # NOTE(review): __init__ takes a required ``args``; this call raises TypeError
+    cpu_check.check()
\ No newline at end of file
diff --git a/profiler/msprof_analyze/precheck/env_check/matmul_shape.yaml b/profiler/msprof_analyze/precheck/env_check/matmul_shape.yaml
new file mode 100644
index 0000000000..690b2c2e1b
--- /dev/null
+++ b/profiler/msprof_analyze/precheck/env_check/matmul_shape.yaml
@@ -0,0 +1,22 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Shape parameters for the matmul workload read by cpu_check.py and npu_check.py
+
+batch_size: 1
+seq_len: 4096
+hidden_size: 8192
+intermediate_size: 3584
\ No newline at end of file
diff --git a/profiler/msprof_analyze/precheck/env_check/npu_check.py b/profiler/msprof_analyze/precheck/env_check/npu_check.py
index b08ab3f83e..3ef5b5eead 100644
--- a/profiler/msprof_analyze/precheck/env_check/npu_check.py
+++ b/profiler/msprof_analyze/precheck/env_check/npu_check.py
@@ -12,7 +12,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
+import json
+import logging
+
+import yaml
+
+import torch
+import torch_npu
+
+from time_analyze import TimeAnalyze
 from msprof_analyze.precheck.env_check.environment_check import HardwareCheck
+from msprof_analyze.precheck.env_check.utils.file import File, FileOpen, FdOpen
+from msprof_analyze.precheck.distributed_cluster.distributed_cluster_base import DistributedClusterBase
 
 
 class NPUCheck(HardwareCheck):
@@ -20,6 +32,89 @@ class NPUCheck(HardwareCheck):
 
     def __init__(self, args):
+        """Set up rank info from the cluster helper and output settings from ``args``."""
         super().__init__(args)
+        cluster = DistributedClusterBase()
+        self.dist = cluster
+        self.rank, self.local_rank = cluster.rank, cluster.local_rank
+        self.nproc_per_node = cluster.local_world_size
+        self.no_shared_storage, self.output_path = args.no_shared_storage, args.output
+        self.output = {}  # filled by check(): rank -> measured time
+
+    def npu_matmul(self, npu_id):
+        """Run 10 rounds of ``mat_c + mat_a x mat_b`` on device ``npu:{npu_id}``.
+
+        Shapes come from ``matmul_shape.yaml`` next to this module (not the CWD).
+        """
+        shape_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'matmul_shape.yaml')
+        with FileOpen(shape_file, 'r', encoding='utf-8') as f:
+            shape = yaml.safe_load(f.read())
+        batch, seq, hidden, inter = (
+            shape[key] for key in ('batch_size', 'seq_len', 'hidden_size', 'intermediate_size'))
+        for _ in range(10):
+            mat_a = torch.randn(batch, seq, hidden).to(f'npu:{npu_id}')
+            mat_b = torch.randn(batch, hidden, inter).to(f'npu:{npu_id}')
+            mat_c = torch.randn(seq, inter).to(f'npu:{npu_id}')
+            torch.addbmm(mat_c, mat_a, mat_b)
 
     def check(self):
-        pass
+        """Time the matmul workload on this rank's NPU, dump the result and, on rank 0, analyze all ranks.
+
+        Each rank writes ``<output>/data/npucheck/npucheck_<rank>.json``. Rank 0 merges
+        every JSON file in that directory and logs whether the slowest rank
+        deviates too much from the mean (see ``TimeAnalyze``).
+        """
+        # Timing events bracket the workload on the current stream.
+        start_event = torch_npu.npu.Event(enable_timing=True)
+        end_event = torch_npu.npu.Event(enable_timing=True)
+        start_event.record()
+        self.npu_matmul(self.local_rank)
+        end_event.record()
+
+        # Synchronize the current stream so every operation has finished.
+        torch_npu.npu.current_stream().synchronize()
+        self.output[self.rank] = start_event.elapsed_time(end_event)
+
+        # Persist this rank's measurement.
+        data_path = os.path.join(self.output_path, "data/npucheck")
+        json_path = os.path.join(data_path, f"npucheck_{self.rank}.json")
+        os.makedirs(os.path.dirname(json_path), exist_ok=True)
+        with FdOpen(json_path, 'w') as json_file:
+            json.dump(self.output, json_file, ensure_ascii=False, indent=4)
+
+        # Without shared storage the per-rank files have to be gathered first.
+        if self.no_shared_storage:
+            self.dist.collect_global_info(data_path, data_path)
+
+        if self.rank != 0:
+            return
+
+        # Merge the per-rank results; each file is keyed by the rank that wrote it.
+        time_all = {}
+        for filename in os.listdir(data_path):
+            if filename.endswith(".json"):
+                with FileOpen(os.path.join(data_path, filename), 'r') as f:
+                    time_all.update(json.load(f))
+
+        logger = logging.getLogger("npucheck")
+        logger.setLevel(logging.INFO)
+        handler = logging.StreamHandler()
+        handler.setLevel(logging.INFO)
+        handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
+        logger.addHandler(handler)
+        try:
+            slow_rank, slow_time, max_ratio, isproblem = TimeAnalyze(time_all).time_analyze()
+            if isproblem:
+                logger.info(f"The NPU:{slow_rank} may have performance issues, "
+                            f"its calculation time is {slow_time}ms, "
+                            f"the relative difference from the average calculation time is {round(max_ratio * 100)}%. "
+                            f"It is recommended to check and optimize the NPU performance on the server.")
+            else:
+                logger.info("NPUs are working well, no performance issues found.")
+        finally:
+            # Detach the handler even if the analysis raises, so handlers never pile up.
+            logger.removeHandler(handler)
+
+
+if __name__ == "__main__":
+    npu_check = NPUCheck()  # NOTE(review): __init__ takes a required ``args``; this call raises TypeError
+    npu_check.check()
\ No newline at end of file
diff --git a/profiler/msprof_analyze/precheck/env_check/time_analyze.py b/profiler/msprof_analyze/precheck/env_check/time_analyze.py
new file mode 100644
index 0000000000..64776900f4
--- /dev/null
+++ b/profiler/msprof_analyze/precheck/env_check/time_analyze.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class TimeAnalyze():
+    """Find the slowest entry of a ``{rank: elapsed_time}`` mapping and judge whether it is an outlier."""
+
+    def __init__(self, time):
+        self.time = time
+
+    def time_analyze(self):
+        """Compare the slowest measurement against the mean of all measurements.
+
+        Returns:
+            A tuple ``(slow_rank, slow_time, max_ratio, isproblem)``: key and value
+            of the largest entry, its relative excess over the mean, and whether
+            that excess is above the 5% threshold. Empty input yields
+            ``(None, None, 0, False)`` rather than an error string, because callers
+            unpack the result into four names.
+        """
+        ratio_threshold = 0.05
+        if not self.time:
+            return None, None, 0, False
+
+        # Key and value of the slowest measurement.
+        slow_rank = max(self.time, key=self.time.get)
+        slow_time = self.time[slow_rank]
+
+        mean_time = sum(self.time.values()) / len(self.time)
+        # A zero mean (e.g. all timings are 0) means "no deviation", not a division by zero.
+        max_ratio = (slow_time - mean_time) / mean_time if mean_time else 0
+
+        # Flag the slowest entry when it exceeds the mean by more than the threshold.
+        return slow_rank, slow_time, max_ratio, max_ratio > ratio_threshold
\ No newline at end of file
-- 
Gitee