From 71fcc3fa2f526b007297a3daa95e36a139cff8bb Mon Sep 17 00:00:00 2001 From: aosudh <14716987+aosudh@user.noreply.gitee.com> Date: Mon, 9 Sep 2024 15:10:02 +0800 Subject: [PATCH 1/7] =?UTF-8?q?=E6=B7=BB=E5=8A=A0msprecheck1.0=E5=B7=A5?= =?UTF-8?q?=E5=85=B7=E4=B8=AD=E7=9A=84=E4=B8=80=E4=BA=9B=E6=A8=A1=E5=9D=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/ms-pre-check/README.md | 36 ++++++++++ profiler/ms-pre-check/common/config.py | 90 ++++++++++++++++++++++++ profiler/ms-pre-check/common/constant.py | 36 ++++++++++ 3 files changed, 162 insertions(+) create mode 100644 profiler/ms-pre-check/README.md create mode 100644 profiler/ms-pre-check/common/config.py create mode 100644 profiler/ms-pre-check/common/constant.py diff --git a/profiler/ms-pre-check/README.md b/profiler/ms-pre-check/README.md new file mode 100644 index 0000000000..c8c2580175 --- /dev/null +++ b/profiler/ms-pre-check/README.md @@ -0,0 +1,36 @@ +# MsPreCheck + +#### 介绍 +MsPreCheck1.0是一个一键性能预检工具,能够快速分析集群计算与通信性能是否达到标杆值。运行完成后能够在命令行窗口与文件夹中生成对应的性能数据。 + +#### 软件架构 +- analyse——analyse 主要分析模块,数据打屏模块,csv数据生成模块 +- common——config 设定算子名称与算子对应性能标杆 + ——constant 设定标定数值 + ——utils 数据打屏模块 +- entrance——entrance 数据采集模块,主程序 +- manager——group_manager 通信域构建与环境变量收集模块 +- pre_check——check 主入口 +- test_op 各算子测试模块 + +#### 安装教程 +1. 确保本机已经安装了昇腾NPU卡驱动包与对应的CANN包。 +2. 克隆代码仓库(请替换`<repository-url>`为实际的仓库URL):`git clone <repository-url>` +3. 新建conda环境,选择python版本为python=3.8:`conda create -n ms_pre_check python=3.8 conda activate ms_pre_check` +4. 
进入ms-pre-check主文件夹中,运行:`pip install -r requirements.txt` + +#### 使用说明 + +##### 单机使用 +修改`run.sh`中的命令为: +`torchrun --nnodes=1 --nproc_per_node=8 --node_rank=0 --master_addr="127.0.0.1" --master_port=29500 ./pre_check/check.py` + +##### 多机使用 +修改`run.sh`中的命令为: +`torchrun --nnodes=<nnodes> --nproc_per_node=<nproc_per_node> --node_rank=<node_rank> --master_addr="<master_addr>" --master_port=<master_port> ./pre_check/check.py` +其中 +- `--nnodes=` 多机节点的总数量 +- `--nproc_per_node=` 单机内卡的数量 +- `--node_rank=` 机器的优先级,按0到(n-1)在每台机器上依次排序(假设总共有n台机器) +- `--master_addr=` 优先级为0的节点的IP地址 +- `--master_port=` 设置的端口,不被占用不冲突即可 \ No newline at end of file diff --git a/profiler/ms-pre-check/common/config.py b/profiler/ms-pre-check/common/config.py new file mode 100644 index 0000000000..8604dc9e1f --- /dev/null +++ b/profiler/ms-pre-check/common/config.py @@ -0,0 +1,90 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from enum import Enum +from common.constant import Constant +from test_op.test_allreduce import TestAllReduce +from test_op.test_matmul import TestMatMul +from test_op.test_mul import TestMul +from test_op.test_allgather import TestAllGather +from test_op.test_alltoall import TestAlltoAll +from test_op.test_reducescatter import TestReduceScatter + +class OpLevel(Enum): + MUL = 0 + MATMUL = 1 + ALL_REDUCE = 2 + REDUCE_SCATTER = 3 + ALL_GATHER = 4 + ALL_to_ALL = 5 + +class DataLevel(Enum): + RANK_ID = 0 + E2E_TIME = 1 + TOTAL_TIME = 2 + FREE_TIME = 3 + MAX_TIME = 4 + MIN_TIME = 5 + SIZE = 6 + BANDWIDTH = 7 + COUNT = 8 + +class Config: + CHECK_LIST = { + OpLevel.MUL: TestMul, + OpLevel.MATMUL: TestMatMul, + OpLevel.ALL_REDUCE: TestAllReduce, + OpLevel.REDUCE_SCATTER: TestReduceScatter, + OpLevel.ALL_GATHER: TestAllGather, + OpLevel.ALL_to_ALL: TestAlltoAll, + } + + BANDWIDTH_BENCHMARK_TABLE = { + OpLevel.ALL_REDUCE.name: { + 2: 56.46223137, + 4: 56.35505051, + 8: 56.14190492, + 16: 55.38822694, + 32: 53.05167839, + 64: 48.87642871, + }, + OpLevel.REDUCE_SCATTER.name: { + 2: 112.9244627 * Constant.BANDWIDTH_RATIO, + 4: 112.710101 * Constant.BANDWIDTH_RATIO, + 8: 112.2838098 * Constant.BANDWIDTH_RATIO, + 16: 110.7764539 * Constant.BANDWIDTH_RATIO, + 32: 106.1033568 * Constant.BANDWIDTH_RATIO, + 64: 97.75285743 * Constant.BANDWIDTH_RATIO, + }, + OpLevel.ALL_GATHER.name: { + 2: 112.9244627 * Constant.BANDWIDTH_RATIO, + 4: 112.710101 * Constant.BANDWIDTH_RATIO, + 8: 112.2838098 * Constant.BANDWIDTH_RATIO, + 16: 110.7764539 * Constant.BANDWIDTH_RATIO, + 32: 106.1033568 * Constant.BANDWIDTH_RATIO, + 64: 97.75285743 * Constant.BANDWIDTH_RATIO, + + }, + OpLevel.ALL_to_ALL.name: { + 2: 29.264 * Constant.BANDWIDTH_RATIO, + 4: 22.476 * Constant.BANDWIDTH_RATIO, + 8: 20.109 * Constant.BANDWIDTH_RATIO, + 16: 19.048 * Constant.BANDWIDTH_RATIO, + 32: 18.455 * Constant.BANDWIDTH_RATIO, + 64: 17.974 * Constant.BANDWIDTH_RATIO, + + }, + + + } \ No newline at end of file diff --git 
a/profiler/ms-pre-check/common/constant.py b/profiler/ms-pre-check/common/constant.py new file mode 100644 index 0000000000..575f2c7d44 --- /dev/null +++ b/profiler/ms-pre-check/common/constant.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import stat + +class Constant: + RANK_ID = "RANK_ID" + OP_TYPE = "OP_TYPE" + E2E_TIME = "E2E_TIME(ms)" + TOTAL_TIME = "TOTAL_TIME(ms)" + FREE_TIME = "FREE_TIME(ms)" + MAX_TIME = "MAX_TIME(ms)" + MIN_TIME = "MIN_TIME(ms)" + SIZE = "SIZE(GB)" + BANDWIDTH = "BANDWIDTH(GB/s)" + BANDWIDTH_BENCHMARK = "BANDWIDTH_BENCHMARK" + COUNT = "COUNT" + BYTE_SIZE = 1024#好像没用到 + MILLI_SECOND = 1000 + DATA_LEN = 8192 + NA = "N/A" + BANDWIDTH_RATIO = 0.9 + WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP + WRITE_FLAGS = os.O_WRONLY | os.O_CREAT | os.O_APPEND \ No newline at end of file -- Gitee From 22403abf51f9caa9404ec43739b8c67af161955e Mon Sep 17 00:00:00 2001 From: aosudh <14716987+aosudh@user.noreply.gitee.com> Date: Mon, 9 Sep 2024 15:22:02 +0800 Subject: [PATCH 2/7] =?UTF-8?q?=E6=B7=BB=E5=8A=A0test=5Fop=E7=88=B6?= =?UTF-8?q?=E7=B1=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/ms-pre-check/test_op/test_op.py | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 profiler/ms-pre-check/test_op/test_op.py diff --git 
a/profiler/ms-pre-check/test_op/test_op.py b/profiler/ms-pre-check/test_op/test_op.py new file mode 100644 index 0000000000..78bf749bdf --- /dev/null +++ b/profiler/ms-pre-check/test_op/test_op.py @@ -0,0 +1,39 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +from dataclasses import dataclass +from common.constant import Constant + +@dataclass +class Statistics: + rank_id: int = -1 + e2e_time: float = 0 + total_time: float = 0 + free_time: float = 0 + max_time: float = 0 + min_time: float = sys.float_info.max + size: float = 0 + bandwidth: float = 0 + count: int = 0 + +class TestOp: + def __init__(self : any, pre_train: int=10, train: int=100, rank: int = 0) -> None: + self.pre_train = pre_train + self.train = train + self.rank = rank + def calculate_tensor_size_g(self: any, tensor: any) -> float: + return tensor.numel() * tensor.element_size() / Constant.BYTE_SIZE / Constant.BYTE_SIZE / Constant.BYTE_SIZE + def run(self): + pass \ No newline at end of file -- Gitee From 58f0ee92e76522f5e77ed8dc382124f38b83eaf3 Mon Sep 17 00:00:00 2001 From: aosudh <14716987+aosudh@user.noreply.gitee.com> Date: Mon, 9 Sep 2024 15:44:36 +0800 Subject: [PATCH 3/7] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=88=9D=E6=AC=A1?= =?UTF-8?q?=E5=90=88=E5=85=A5=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/ms-pre-check/LICENSE | 14 +++ 
profiler/ms-pre-check/analyse/__init__.py | 0 profiler/ms-pre-check/analyse/analyse.py | 101 ++++++++++++++++++ profiler/ms-pre-check/common/__init__.py | 0 profiler/ms-pre-check/common/config.py | 90 ---------------- profiler/ms-pre-check/common/utils.py | 55 ++++++++++ profiler/ms-pre-check/entrance/__init__.py | 0 profiler/ms-pre-check/entrance/entrance.py | 72 +++++++++++++ profiler/ms-pre-check/manager/__init__.py | 0 .../ms-pre-check/manager/group_manager.py | 53 +++++++++ profiler/ms-pre-check/pre_check/__init__.py | 0 .../constant.py => pre_check/check.py} | 31 ++---- profiler/ms-pre-check/run.sh | 2 + profiler/ms-pre-check/test_op/__init__.py | 0 14 files changed, 307 insertions(+), 111 deletions(-) create mode 100644 profiler/ms-pre-check/LICENSE create mode 100644 profiler/ms-pre-check/analyse/__init__.py create mode 100644 profiler/ms-pre-check/analyse/analyse.py create mode 100644 profiler/ms-pre-check/common/__init__.py delete mode 100644 profiler/ms-pre-check/common/config.py create mode 100644 profiler/ms-pre-check/common/utils.py create mode 100644 profiler/ms-pre-check/entrance/__init__.py create mode 100644 profiler/ms-pre-check/entrance/entrance.py create mode 100644 profiler/ms-pre-check/manager/__init__.py create mode 100644 profiler/ms-pre-check/manager/group_manager.py create mode 100644 profiler/ms-pre-check/pre_check/__init__.py rename profiler/ms-pre-check/{common/constant.py => pre_check/check.py} (50%) create mode 100644 profiler/ms-pre-check/run.sh create mode 100644 profiler/ms-pre-check/test_op/__init__.py diff --git a/profiler/ms-pre-check/LICENSE b/profiler/ms-pre-check/LICENSE new file mode 100644 index 0000000000..a7e3dc0765 --- /dev/null +++ b/profiler/ms-pre-check/LICENSE @@ -0,0 +1,14 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/profiler/ms-pre-check/analyse/__init__.py b/profiler/ms-pre-check/analyse/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/profiler/ms-pre-check/analyse/analyse.py b/profiler/ms-pre-check/analyse/analyse.py new file mode 100644 index 0000000000..90432358d8 --- /dev/null +++ b/profiler/ms-pre-check/analyse/analyse.py @@ -0,0 +1,101 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +from common.constant import Constant +from common.config import OpLevel,DataLevel,Config +from common.utils import get_current_time_str,create_csv_writer +from manager.group_manager import GroupManager +from prettytable import PrettyTable +class Analyse: + def analyse(self, rank: int, gather_list: list) -> bool: + ''' + Analyse 主要对外入口 + ''' + if rank != 0: + return True + csv_path=os.path.abspath(os.path.join(f"./analyse_{get_current_time_str()}.csv")) + for node_id,node_tensor in enumerate(gather_list): + node_tensor_list = node_tensor.tolist() + for rank_id,tensor_data in enumerate(node_tensor_list): + complete_data = self._add_data(tensor_data) + self._print_result(complete_data) + self._write_csv(csv_path,node_id * GroupManager().get_rank_size() + rank_id, complete_data) + print(f'For further evaluation, please open csv file on {csv_path}') + return True + def _add_data(self, tensor_data: list) -> list: + ''' + tensor input data to dict + ''' + keys=[ + Constant.RANK_ID, Constant.E2E_TIME, Constant.TOTAL_TIME, Constant.FREE_TIME, + Constant.MAX_TIME, Constant. MIN_TIME,Constant. SIZE,Constant. 
BANDWIDTH, Constant.COUNT + ] + result=[] + for index, tensor in enumerate(tensor_data): + converted_dict = dict(zip(keys, tensor)) + op_type = OpLevel(index).name + converted_dict[Constant.OP_TYPE] = op_type + if op_type in (OpLevel.MUL.name, OpLevel.MATMUL.name,): + converted_dict[Constant.BANDWIDTH] = Constant.NA + converted_dict[Constant.BANDWIDTH_BENCHMARK] = Constant.NA + else: + bench_mark = Config.BANDWIDTH_BENCHMARK_TABLE.get(op_type).get(GroupManager().get_rank_size()) + converted_dict[Constant.BANDWIDTH_BENCHMARK] = "PASS" if converted_dict[Constant.BANDWIDTH] >= bench_mark else "FAIL" + result.append(converted_dict) + return result + + def _print_result(self, complete_data: list): + table_data={} + for card in complete_data: + rank_id = card['RANK_ID'] + op_type = card['OP_TYPE'] + bandwidth_benchmark = card['BANDWIDTH_BENCHMARK'] + if rank_id not in table_data: + table_data[rank_id] = {} + table_data[rank_id][op_type]=bandwidth_benchmark + all_op_types = [card['OP_TYPE'] for card in complete_data] + table_rows=[ + [rank_id] + [table_data[rank_id].get(op_type, 'N/A') for op_type in all_op_types] for rank_id in sorted(table_data.keys()) + ] + headers=['RANK_ID'] + all_op_types + if rank_id == 0: + table = PrettyTable(headers) + table.add_rows(table_rows) + table.border = False + print(table) + elif 1<= rank_id <= 9: + print(' ',table_rows[0][0],' ',table_rows[0][1],' ',table_rows[0][2],' ',table_rows[0][3],' ',table_rows[0][4],' ',table_rows[0][5],' ',table_rows[0][6]) + elif 10<= rank_id: + print(' ',table_rows[0][0],' ',table_rows[0][1],' ',table_rows[0][2],' ',table_rows[0][3],' ',table_rows[0][4],' ',table_rows[0][5],' ',table_rows[0][6]) + ''' + for data in complete_data: + print(f"rank is {data.get(Constant.RANK_ID)}, op type is {data.get(Constant.OP_TYPE)}," + f"e2e_time is {data.get(Constant.E2E_TIME)}, total_time is {data.get(Constant.TOTAL_TIME)}," + f"free_time is {data.get(Constant.FREE_TIME)}, max_time is {data.get(Constant.MAX_TIME)}," + 
f"min_time is {data.get(Constant.MIN_TIME)}, size is {data.get(Constant.SIZE)} GB," + f"bandwidth is {data.get(Constant.BANDWIDTH)} GB/s," + f"bandwidth {data.get(Constant.BANDWIDTH_BENCHMARK)}" + ) + ''' + def _write_csv(self, csv_path: str, rank: int, complete_data: list): + ''' + 创建csv文件 + ''' + if not complete_data: + print("ERROR no data.") + return + headers = list(complete_data[0].keys()) if rank == 0 else [] + list_data = [list(d.values()) for d in complete_data] + create_csv_writer(csv_path, headers, list_data) \ No newline at end of file diff --git a/profiler/ms-pre-check/common/__init__.py b/profiler/ms-pre-check/common/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/profiler/ms-pre-check/common/config.py b/profiler/ms-pre-check/common/config.py deleted file mode 100644 index 8604dc9e1f..0000000000 --- a/profiler/ms-pre-check/common/config.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2024 Huawei Technologies Co., Ltd -# All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from enum import Enum -from common.constant import Constant -from test_op.test_allreduce import TestAllReduce -from test_op.test_matmul import TestMatMul -from test_op.test_mul import TestMul -from test_op.test_allgather import TestAllGather -from test_op.test_alltoall import TestAlltoAll -from test_op.test_reducescatter import TestReduceScatter - -class OpLevel(Enum): - MUL = 0 - MATMUL = 1 - ALL_REDUCE = 2 - REDUCE_SCATTER = 3 - ALL_GATHER = 4 - ALL_to_ALL = 5 - -class DataLevel(Enum): - RANK_ID = 0 - E2E_TIME = 1 - TOTAL_TIME = 2 - FREE_TIME = 3 - MAX_TIME = 4 - MIN_TIME = 5 - SIZE = 6 - BANDWIDTH = 7 - COUNT = 8 - -class Config: - CHECK_LIST = { - OpLevel.MUL: TestMul, - OpLevel.MATMUL: TestMatMul, - OpLevel.ALL_REDUCE: TestAllReduce, - OpLevel.REDUCE_SCATTER: TestReduceScatter, - OpLevel.ALL_GATHER: TestAllGather, - OpLevel.ALL_to_ALL: TestAlltoAll, - } - - BANDWIDTH_BENCHMARK_TABLE = { - OpLevel.ALL_REDUCE.name: { - 2: 56.46223137, - 4: 56.35505051, - 8: 56.14190492, - 16: 55.38822694, - 32: 53.05167839, - 64: 48.87642871, - }, - OpLevel.REDUCE_SCATTER.name: { - 2: 112.9244627 * Constant.BANDWIDTH_RATIO, - 4: 112.710101 * Constant.BANDWIDTH_RATIO, - 8: 112.2838098 * Constant.BANDWIDTH_RATIO, - 16: 110.7764539 * Constant.BANDWIDTH_RATIO, - 32: 106.1033568 * Constant.BANDWIDTH_RATIO, - 64: 97.75285743 * Constant.BANDWIDTH_RATIO, - }, - OpLevel.ALL_GATHER.name: { - 2: 112.9244627 * Constant.BANDWIDTH_RATIO, - 4: 112.710101 * Constant.BANDWIDTH_RATIO, - 8: 112.2838098 * Constant.BANDWIDTH_RATIO, - 16: 110.7764539 * Constant.BANDWIDTH_RATIO, - 32: 106.1033568 * Constant.BANDWIDTH_RATIO, - 64: 97.75285743 * Constant.BANDWIDTH_RATIO, - - }, - OpLevel.ALL_to_ALL.name: { - 2: 29.264 * Constant.BANDWIDTH_RATIO, - 4: 22.476 * Constant.BANDWIDTH_RATIO, - 8: 20.109 * Constant.BANDWIDTH_RATIO, - 16: 19.048 * Constant.BANDWIDTH_RATIO, - 32: 18.455 * Constant.BANDWIDTH_RATIO, - 64: 17.974 * Constant.BANDWIDTH_RATIO, - - }, - - - } \ No newline at end of file diff --git 
a/profiler/ms-pre-check/common/utils.py b/profiler/ms-pre-check/common/utils.py new file mode 100644 index 0000000000..57d34eb2a5 --- /dev/null +++ b/profiler/ms-pre-check/common/utils.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import csv +import os +from datetime import datetime +from datetime import timezone +from common.constant import Constant + +def create_csv_writer(csv_file: str, headers: list,data: list): + with FdOpen(csv_file, newline='', operate = 'a') as _csv_file: + writer = csv.writer(_csv_file) + if headers: + writer.writerow(headers) + slice_count = len(data) // Constant.DATA_LEN + for index in range(slice_count): + writer.writerows(data[index * Constant.DATA_LEN:(index + 1) * Constant.DATA_LEN]) + writer.writerows(data[slice_count * Constant.DATA_LEN:]) +def get_current_time_str() -> str: + utc_time = datetime.now(tz=timezone.utc) + current_time = utc_time.replace(tzinfo=timezone.utc).astimezone(tz=None) + return current_time.strftime("%Y%m%d%H%M%S") +class FdOpen: + def __init__(self: any, file_path: str, flags: int = Constant.WRITE_FLAGS, mode: int = Constant.WRITE_MODES, operate: str = 'w', newline: str = None) -> None: + self.file_path = file_path + self.flags = flags + self.mode = mode + self.operate = operate + self.newline = newline + self.fd = None + self.file_open = None + + def __enter__(self: any) -> any: + self.fd = os.open(self.file_path, 
self.flags, self.mode) + if self.newline is None: + self.file_open = os.fdopen(self.fd, self.operate) + else: + self.file_open = os.fdopen(self.fd, self.operate, newline=self.newline) + return self.file_open + def __exit__(self, exc_type, exc_val, exc_tb): + if self.file_open: + self.file_open.close() + elif self.fd: + os.close(self.fd) \ No newline at end of file diff --git a/profiler/ms-pre-check/entrance/__init__.py b/profiler/ms-pre-check/entrance/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/profiler/ms-pre-check/entrance/entrance.py b/profiler/ms-pre-check/entrance/entrance.py new file mode 100644 index 0000000000..3126bed747 --- /dev/null +++ b/profiler/ms-pre-check/entrance/entrance.py @@ -0,0 +1,72 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import datetime +import os +from typing import List + +import torch +import torch_npu +import torch.distributed as dist + +from analyse.analyse import Analyse +from common.config import Config +from manager.group_manager import GroupManager +from test_op.test_op import Statistics + +class Entrance: + statistics_list = [] + def generate_tensor(self, statistics_list: List[Statistics], local_rank) -> any: + return torch.Tensor([[s.rank_id, s.e2e_time, s.total_time, s.free_time, s.max_time, s.min_time, + s.size, s.bandwidth,s.count ] for s in statistics_list]).npu(local_rank) + def gather_rank_data(self,tensor,local_rank) -> list: + print(f"gather rank data, {local_rank}") + dist.barrier() + gather_list = [] + rank_size = GroupManager().get_rank_size() + rank = GroupManager().get_rank() + if local_rank == 0: + for _ in range(rank_size): + gather_list.append((torch.zeros(len(Config.CHECK_LIST),9)).npu(local_rank)) + dist.gather(tensor, gather_list = gather_list, dst= rank // rank_size * rank_size,group = GroupManager().get_local_group()) + if rank % rank_size == 0: + rank_tensor = torch.stack(gather_list) + gather_list = [] + if rank == 0: + for _ in range(GroupManager().get_world_size() // rank_size): + gather_list.append((torch.zeros(rank_size, len(Config.CHECK_LIST),9)).npu(local_rank)) + dist.gather(rank_tensor, gather_list = gather_list, dst= 0, group=GroupManager().get_gather_group()) + return gather_list + + def run(self): + GroupManager() + rank = GroupManager().get_rank() + local_rank = GroupManager().get_local_rank() + print(f"start run, rank is {rank}, local_rank is {local_rank}") + index=1 + while True: + statistics_list = ['']* len(Config.CHECK_LIST) + for key, value in Config.CHECK_LIST.items(): + statistics_list[key.value] = value(rank=local_rank).run() + if local_rank == 0: + print(str(key).split('.')[-1] + " end at group " + str(index-1)) + tensor = self.generate_tensor(statistics_list, local_rank) + gather_list = self.gather_rank_data(tensor, 
local_rank) + if Analyse().analyse(rank=rank, gather_list=gather_list): + break + index += 1 + def main(self: any) -> None: + torch.npu.set_device(int(os.environ['LOCAL_RANK'])) + dist.init_process_group(backend='hccl', rank=int(os.environ['RANK']), world_size=int(os.environ['WORLD_SIZE']), timeout=datetime.timedelta(seconds=1800)) + self.run() \ No newline at end of file diff --git a/profiler/ms-pre-check/manager/__init__.py b/profiler/ms-pre-check/manager/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/profiler/ms-pre-check/manager/group_manager.py b/profiler/ms-pre-check/manager/group_manager.py new file mode 100644 index 0000000000..8ff542363b --- /dev/null +++ b/profiler/ms-pre-check/manager/group_manager.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import datetime +import os +import torch +import torch.distributed as dist +from common.singleton import singleton +@singleton +class GroupManager: + def __init__(self:any)->None: + self._rank = int(os.environ['RANK']) + self._local_rank = int(os.environ['LOCAL_RANK']) + self._world_size = int(os.environ['WORLD_SIZE']) + self._group_rank = int(os.environ['GROUP_RANK']) + self._rank_size = int(os.environ['LOCAL_WORLD_SIZE']) + self._local_group = None + self._gather_group = None + def get_rank(self): + return self._rank + def get_local_rank(self): + return self._local_rank + def get_world_size(self): + return self._world_size + def get_rank_size(self): + return self._rank_size + def get_local_group(self): + if self._local_group is None: + groups=[x for x in range(self._group_rank * self._rank_size , (self._group_rank+1) * self._rank_size )] + if self._local_rank == 0: + print(f"local groups are : {groups}") + self._local_group = dist.new_group(ranks = groups, timeout = datetime.timedelta(seconds=1800)) + return self._local_group + def get_gather_group(self): + if self._gather_group is None: + groups = [x for x in range(self._world_size) if x % self._rank_size == 0] + if self._local_rank == 0: + print(f"gather groups are : {groups}") + self._gather_group = dist.new_group(ranks = groups, timeout = datetime.timedelta(seconds=1800)) + return self._gather_group + + \ No newline at end of file diff --git a/profiler/ms-pre-check/pre_check/__init__.py b/profiler/ms-pre-check/pre_check/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/profiler/ms-pre-check/common/constant.py b/profiler/ms-pre-check/pre_check/check.py similarity index 50% rename from profiler/ms-pre-check/common/constant.py rename to profiler/ms-pre-check/pre_check/check.py index 575f2c7d44..46a8a2ad43 100644 --- a/profiler/ms-pre-check/common/constant.py +++ b/profiler/ms-pre-check/pre_check/check.py @@ -12,25 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 
or implied. # See the License for the specific language governing permissions and # limitations under the License. +import importlib import os -import stat - -class Constant: - RANK_ID = "RANK_ID" - OP_TYPE = "OP_TYPE" - E2E_TIME = "E2E_TIME(ms)" - TOTAL_TIME = "TOTAL_TIME(ms)" - FREE_TIME = "FREE_TIME(ms)" - MAX_TIME = "MAX_TIME(ms)" - MIN_TIME = "MIN_TIME(ms)" - SIZE = "SIZE(GB)" - BANDWIDTH = "BANDWIDTH(GB/s)" - BANDWIDTH_BENCHMARK = "BANDWIDTH_BENCHMARK" - COUNT = "COUNT" - BYTE_SIZE = 1024#好像没用到 - MILLI_SECOND = 1000 - DATA_LEN = 8192 - NA = "N/A" - BANDWIDTH_RATIO = 0.9 - WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP - WRITE_FLAGS = os.O_WRONLY | os.O_CREAT | os.O_APPEND \ No newline at end of file +import sys +if __name__ == '__main__': + sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__),'..'))) + MODEL_PATH = "entrance.entrance" + ENTRANCE_CLASS = "Entrance" + os.umask(0o027) + entrance_module = importlib.import_module(MODEL_PATH) + if hasattr(entrance_module,ENTRANCE_CLASS): + getattr(entrance_module,ENTRANCE_CLASS)().main() \ No newline at end of file diff --git a/profiler/ms-pre-check/run.sh b/profiler/ms-pre-check/run.sh new file mode 100644 index 0000000000..36243970f9 --- /dev/null +++ b/profiler/ms-pre-check/run.sh @@ -0,0 +1,2 @@ +torchrun --nnodes=1 --nproc_per_node=8 --node_rank=0 --master_addr="127.0.0.1" --master_port=29500 ./pre_check/check.py +#torchrun --nnodes=2 --nproc_per_node=8 --node_rank=0 --master_addr="192.168.1.1" --master_port=29500 ./pre_check/check.py \ No newline at end of file diff --git a/profiler/ms-pre-check/test_op/__init__.py b/profiler/ms-pre-check/test_op/__init__.py new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 402de45d8b97043af2f2b69811020a7c0f637088 Mon Sep 17 00:00:00 2001 From: aosudh <14716987+aosudh@user.noreply.gitee.com> Date: Thu, 12 Sep 2024 07:05:41 +0000 Subject: [PATCH 4/7] update profiler/ms-pre-check/run.sh. 
Signed-off-by: aosudh <14716987+aosudh@user.noreply.gitee.com> --- profiler/ms-pre-check/run.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/profiler/ms-pre-check/run.sh b/profiler/ms-pre-check/run.sh index 36243970f9..47b65c6156 100644 --- a/profiler/ms-pre-check/run.sh +++ b/profiler/ms-pre-check/run.sh @@ -1,2 +1 @@ -torchrun --nnodes=1 --nproc_per_node=8 --node_rank=0 --master_addr="127.0.0.1" --master_port=29500 ./pre_check/check.py -#torchrun --nnodes=2 --nproc_per_node=8 --node_rank=0 --master_addr="192.168.1.1" --master_port=29500 ./pre_check/check.py \ No newline at end of file +torchrun --nnodes=1 --nproc_per_node=8 --node_rank=0 --master_addr="127.0.0.1" --master_port=29500 ./pre_check/check.py \ No newline at end of file -- Gitee From 80f0b4e58774e9229e2f365897ac6e5be0c78f21 Mon Sep 17 00:00:00 2001 From: aosudh <14716987+aosudh@user.noreply.gitee.com> Date: Fri, 13 Sep 2024 10:36:50 +0800 Subject: [PATCH 5/7] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=A3=80=E8=A7=86?= =?UTF-8?q?=E6=84=8F=E8=A7=81=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../{analyse => analyze}/__init__.py | 0 .../analyse.py => analyze/analyze.py} | 38 ++-- profiler/ms-pre-check/common/path_manager.py | 205 ++++++++++++++++++ profiler/ms-pre-check/common/utils.py | 9 +- profiler/ms-pre-check/entrance/entrance.py | 15 +- .../ms-pre-check/manager/group_manager.py | 26 ++- profiler/ms-pre-check/pre_check/check.py | 2 +- 7 files changed, 256 insertions(+), 39 deletions(-) rename profiler/ms-pre-check/{analyse => analyze}/__init__.py (100%) rename profiler/ms-pre-check/{analyse/analyse.py => analyze/analyze.py} (71%) create mode 100644 profiler/ms-pre-check/common/path_manager.py diff --git a/profiler/ms-pre-check/analyse/__init__.py b/profiler/ms-pre-check/analyze/__init__.py similarity index 100% rename from profiler/ms-pre-check/analyse/__init__.py rename to profiler/ms-pre-check/analyze/__init__.py diff 
--git a/profiler/ms-pre-check/analyse/analyse.py b/profiler/ms-pre-check/analyze/analyze.py similarity index 71% rename from profiler/ms-pre-check/analyse/analyse.py rename to profiler/ms-pre-check/analyze/analyze.py index 90432358d8..f5313e2468 100644 --- a/profiler/ms-pre-check/analyse/analyse.py +++ b/profiler/ms-pre-check/analyze/analyze.py @@ -18,20 +18,20 @@ from common.config import OpLevel,DataLevel,Config from common.utils import get_current_time_str,create_csv_writer from manager.group_manager import GroupManager from prettytable import PrettyTable -class Analyse: - def analyse(self, rank: int, gather_list: list) -> bool: +class Analysis: + def analyze(self, rank: int, gather_list: list) -> bool: ''' - Analyse 主要对外入口 + Analyze 主要对外入口 ''' if rank != 0: return True - csv_path=os.path.abspath(os.path.join(f"./analyse_{get_current_time_str()}.csv")) - for node_id,node_tensor in enumerate(gather_list): + csv_path=os.path.abspath(f"./analyze_{get_current_time_str()}.csv") + for node,node_tensor in enumerate(gather_list): node_tensor_list = node_tensor.tolist() for rank_id,tensor_data in enumerate(node_tensor_list): complete_data = self._add_data(tensor_data) self._print_result(complete_data) - self._write_csv(csv_path,node_id * GroupManager().get_rank_size() + rank_id, complete_data) + self._write_csv(csv_path,node * GroupManager().get_rank_size() + rank_id, complete_data) print(f'For further evaluation, please open csv file on {csv_path}') return True def _add_data(self, tensor_data: list) -> list: @@ -43,9 +43,9 @@ class Analyse: Constant.MAX_TIME, Constant. MIN_TIME,Constant. SIZE,Constant. 
BANDWIDTH, Constant.COUNT ] result=[] - for index, tensor in enumerate(tensor_data): + for op_index, tensor in enumerate(tensor_data): converted_dict = dict(zip(keys, tensor)) - op_type = OpLevel(index).name + op_type = OpLevel(op_index).name converted_dict[Constant.OP_TYPE] = op_type if op_type in (OpLevel.MUL.name, OpLevel.MATMUL.name,): converted_dict[Constant.BANDWIDTH] = Constant.NA @@ -58,14 +58,14 @@ class Analyse: def _print_result(self, complete_data: list): table_data={} - for card in complete_data: - rank_id = card['RANK_ID'] - op_type = card['OP_TYPE'] - bandwidth_benchmark = card['BANDWIDTH_BENCHMARK'] + for card_data in complete_data: + rank_id = card_data['RANK_ID'] + op_type = card_data['OP_TYPE'] + bandwidth_benchmark = card_data['BANDWIDTH_BENCHMARK'] if rank_id not in table_data: table_data[rank_id] = {} table_data[rank_id][op_type]=bandwidth_benchmark - all_op_types = [card['OP_TYPE'] for card in complete_data] + all_op_types = [card_data['OP_TYPE'] for card_data in complete_data] table_rows=[ [rank_id] + [table_data[rank_id].get(op_type, 'N/A') for op_type in all_op_types] for rank_id in sorted(table_data.keys()) ] @@ -79,16 +79,6 @@ class Analyse: print(' ',table_rows[0][0],' ',table_rows[0][1],' ',table_rows[0][2],' ',table_rows[0][3],' ',table_rows[0][4],' ',table_rows[0][5],' ',table_rows[0][6]) elif 10<= rank_id: print(' ',table_rows[0][0],' ',table_rows[0][1],' ',table_rows[0][2],' ',table_rows[0][3],' ',table_rows[0][4],' ',table_rows[0][5],' ',table_rows[0][6]) - ''' - for data in complete_data: - print(f"rank is {data.get(Constant.RANK_ID)}, op type is {data.get(Constant.OP_TYPE)}," - f"e2e_time is {data.get(Constant.E2E_TIME)}, total_time is {data.get(Constant.TOTAL_TIME)}," - f"free_time is {data.get(Constant.FREE_TIME)}, max_time is {data.get(Constant.MAX_TIME)}," - f"min_time is {data.get(Constant.MIN_TIME)}, size is {data.get(Constant.SIZE)} GB," - f"bandwidth is {data.get(Constant.BANDWIDTH)} GB/s," - f"bandwidth 
{data.get(Constant.BANDWIDTH_BENCHMARK)}" - ) - ''' def _write_csv(self, csv_path: str, rank: int, complete_data: list): ''' 创建csv文件 @@ -97,5 +87,5 @@ class Analyse: print("ERROR no data.") return headers = list(complete_data[0].keys()) if rank == 0 else [] - list_data = [list(d.values()) for d in complete_data] + list_data = [list(data.values()) for data in complete_data] create_csv_writer(csv_path, headers, list_data) \ No newline at end of file diff --git a/profiler/ms-pre-check/common/path_manager.py b/profiler/ms-pre-check/common/path_manager.py new file mode 100644 index 0000000000..a9c9481493 --- /dev/null +++ b/profiler/ms-pre-check/common/path_manager.py @@ -0,0 +1,205 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import re +import shutil +import platform + +from .constant import Constant + + +class PathManager: + MAX_PATH_LENGTH = 4096 + MAX_FILE_NAME_LENGTH = 255 + DATA_FILE_AUTHORITY = 0o640 + DATA_DIR_AUTHORITY = 0o750 + WINDOWS = "windows" + + @classmethod + def check_input_directory_path(cls, path: str): + """ + Function Description: + check whether the path is valid, some businesses can accept a path that does not exist, + so the function do not verify whether the path exists + Parameter: + path: the path to check, whether the incoming path is absolute or relative depends on the business + Exception Description: + when invalid data throw exception + """ + cls.input_path_common_check(path) + base_name = os.path.basename(path) + if os.path.isfile(path): + msg = f"Invalid input path which is a file path: {base_name}" + raise RuntimeError(msg) + + @classmethod + def check_input_file_path(cls, path: str): + """ + Function Description: + check whether the file path is valid, some businesses can accept a path that does not exist, + so the function do not verify whether the path exists + Parameter: + path: the file path to check, whether the incoming path is absolute or relative depends on the business + Exception Description: + when invalid data throw exception + """ + cls.input_path_common_check(path) + base_name = os.path.basename(path) + if os.path.isdir(path): + msg = f"Invalid input path which is a directory path: {base_name}" + raise RuntimeError(msg) + + @classmethod + def check_path_length(cls, path: str): + if len(path) > cls.MAX_PATH_LENGTH: + raise RuntimeError("Length of input path exceeds the limit.") + path_split_list = path.split("/") + for path in path_split_list: + path_list = path.split("\\") + for name in path_list: + if len(name) > cls.MAX_FILE_NAME_LENGTH: + raise RuntimeError("Length of input path exceeds the limit.") + + @classmethod + def input_path_common_check(cls, path: str): + cls.check_path_length(path) + + if os.path.islink(path): + 
msg = f"Invalid input path which is a soft link." + raise RuntimeError(msg) + + if platform.system().lower() == cls.WINDOWS: + pattern = r'(\.|:|\\|/|_|-|\s|[~0-9a-zA-Z\u4e00-\u9fa5])+' + else: + pattern = r'(\.|/|_|-|\s|[~0-9a-zA-Z])+' + if not re.fullmatch(pattern, path): + msg = f"Invalid input path." + raise RuntimeError(msg) + + @classmethod + def check_path_owner_consistent(cls, path: str): + """ + Function Description: + check whether the path belong to process owner + Parameter: + path: the path to check + Exception Description: + when invalid path, prompt the user + """ + base_name = os.path.basename(path) + if not os.path.exists(path): + msg = f"Invalid path: {base_name}" + raise RuntimeError(msg) + if platform.system().lower() == cls.WINDOWS: + return + if os.stat(path).st_uid != os.getuid(): + check_msg = input("The path does not belong to you, do you want to continue? [y/n]") + if check_msg.lower() != "y": + raise RuntimeError("The user choose not to continue.") + + @classmethod + def check_path_writeable(cls, path): + """ + Function Description: + check whether the path is writable + Parameter: + path: the path to check + Exception Description: + when invalid data throw exception + """ + cls.check_path_owner_consistent(path) + if os.path.islink(path): + msg = f"Invalid path which is a soft link." + raise RuntimeError(msg) + base_name = os.path.basename(path) + if not os.access(path, os.W_OK): + msg = f"The path permission check failed: {base_name}" + raise RuntimeError(msg) + + @classmethod + def check_path_readable(cls, path): + """ + Function Description: + check whether the path is writable + Parameter: + path: the path to check + Exception Description: + when invalid data throw exception + """ + cls.check_path_owner_consistent(path) + if os.path.islink(path): + msg = f"Invalid path which is a soft link." 
+ raise RuntimeError(msg) + base_name = os.path.basename(path) + if not os.access(path, os.R_OK): + msg = f"The path permission check failed: {base_name}" + raise RuntimeError(msg) + + @classmethod + def remove_path_safety(cls, path: str): + base_name = os.path.basename(path) + msg = f"Failed to remove path: {base_name}" + cls.check_path_writeable(path) + if os.path.islink(path): + raise RuntimeError(msg) + if os.path.exists(path): + try: + shutil.rmtree(path) + except Exception as err: + raise RuntimeError(msg) from err + + @classmethod + def make_dir_safety(cls, path: str): + base_name = os.path.basename(path) + msg = f"Failed to make directory: {base_name}" + if os.path.islink(path): + raise RuntimeError(msg) + if os.path.exists(path): + return + try: + os.makedirs(path, mode=cls.DATA_DIR_AUTHORITY) + except Exception as err: + raise RuntimeError(msg) from err + + @classmethod + def create_file_safety(cls, path: str): + base_name = os.path.basename(path) + msg = f"Failed to create file: {base_name}" + if os.path.islink(path): + raise RuntimeError(msg) + if os.path.exists(path): + return + try: + os.close(os.open(path, os.O_WRONLY | os.O_CREAT, cls.DATA_FILE_AUTHORITY)) + except Exception as err: + raise RuntimeError(msg) from err + + @classmethod + def get_realpath(cls, path: str) -> str: + if os.path.islink(path): + msg = f"Invalid input path which is a soft link." + raise RuntimeError(msg) + return os.path.realpath(path) + + @classmethod + def check_file_size(cls, file_path: str): + if not os.path.exists(file_path): + raise FileNotFoundError(f"The file {file_path} does not exists.") + file_size = os.path.getsize(file_path) + if file_size > Constant.MAX_FILE_SIZE_5_GB: + check_msg = input( + f"The file({file_path}) size exceeds the preset max value. Continue reading the file? 
[y/n]") + if check_msg.lower() != "y": + raise RuntimeError(f"[WARNING] The user choose not to read the file: {file_path}") diff --git a/profiler/ms-pre-check/common/utils.py b/profiler/ms-pre-check/common/utils.py index 57d34eb2a5..4a93ca43b5 100644 --- a/profiler/ms-pre-check/common/utils.py +++ b/profiler/ms-pre-check/common/utils.py @@ -17,9 +17,11 @@ import os from datetime import datetime from datetime import timezone from common.constant import Constant +from path_manager import PathManager -def create_csv_writer(csv_file: str, headers: list,data: list): - with FdOpen(csv_file, newline='', operate = 'a') as _csv_file: +def create_csv_writer(csv_file_path: str, headers: list,data: list): + PathManager.check_path_writeable(csv_file_path) + with FdOpen(csv_file_path, newline='', operate = 'a') as _csv_file: writer = csv.writer(_csv_file) if headers: writer.writerow(headers) @@ -27,10 +29,12 @@ def create_csv_writer(csv_file: str, headers: list,data: list): for index in range(slice_count): writer.writerows(data[index * Constant.DATA_LEN:(index + 1) * Constant.DATA_LEN]) writer.writerows(data[slice_count * Constant.DATA_LEN:]) + def get_current_time_str() -> str: utc_time = datetime.now(tz=timezone.utc) current_time = utc_time.replace(tzinfo=timezone.utc).astimezone(tz=None) return current_time.strftime("%Y%m%d%H%M%S") + class FdOpen: def __init__(self: any, file_path: str, flags: int = Constant.WRITE_FLAGS, mode: int = Constant.WRITE_MODES, operate: str = 'w', newline: str = None) -> None: self.file_path = file_path @@ -48,6 +52,7 @@ class FdOpen: else: self.file_open = os.fdopen(self.fd, self.operate, newline=self.newline) return self.file_open + def __exit__(self, exc_type, exc_val, exc_tb): if self.file_open: self.file_open.close() diff --git a/profiler/ms-pre-check/entrance/entrance.py b/profiler/ms-pre-check/entrance/entrance.py index 3126bed747..9b7544a562 100644 --- a/profiler/ms-pre-check/entrance/entrance.py +++ 
b/profiler/ms-pre-check/entrance/entrance.py @@ -20,7 +20,7 @@ import torch import torch_npu import torch.distributed as dist -from analyse.analyse import Analyse +from analyze.analyze import Analysis from common.config import Config from manager.group_manager import GroupManager from test_op.test_op import Statistics @@ -30,6 +30,7 @@ class Entrance: def generate_tensor(self, statistics_list: List[Statistics], local_rank) -> any: return torch.Tensor([[s.rank_id, s.e2e_time, s.total_time, s.free_time, s.max_time, s.min_time, s.size, s.bandwidth,s.count ] for s in statistics_list]).npu(local_rank) + def gather_rank_data(self,tensor,local_rank) -> list: print(f"gather rank data, {local_rank}") dist.barrier() @@ -57,15 +58,19 @@ class Entrance: index=1 while True: statistics_list = ['']* len(Config.CHECK_LIST) - for key, value in Config.CHECK_LIST.items(): - statistics_list[key.value] = value(rank=local_rank).run() + if Config.CHECK_LIST is None: + print("CHECK_LIST is None") + break + for op_name, op_func in Config.CHECK_LIST.items(): + statistics_list[op_name.op_func] = op_func(rank=local_rank).run() if local_rank == 0: - print(str(key).split('.')[-1] + " end at group " + str(index-1)) + print(str(op_name).split('.')[-1] + " end at group " + str(index-1)) tensor = self.generate_tensor(statistics_list, local_rank) gather_list = self.gather_rank_data(tensor, local_rank) - if Analyse().analyse(rank=rank, gather_list=gather_list): + if Analysis().analyze(rank=rank, gather_list=gather_list): break index += 1 + def main(self: any) -> None: torch.npu.set_device(int(os.environ['LOCAL_RANK'])) dist.init_process_group(backend='hccl', rank=int(os.environ['RANK']), world_size=int(os.environ['WORLD_SIZE']), timeout=datetime.timedelta(seconds=1800)) diff --git a/profiler/ms-pre-check/manager/group_manager.py b/profiler/ms-pre-check/manager/group_manager.py index 8ff542363b..b210f9bc1b 100644 --- a/profiler/ms-pre-check/manager/group_manager.py +++ 
b/profiler/ms-pre-check/manager/group_manager.py @@ -19,22 +19,30 @@ import torch.distributed as dist from common.singleton import singleton @singleton class GroupManager: + _initialized = False def __init__(self:any)->None: - self._rank = int(os.environ['RANK']) - self._local_rank = int(os.environ['LOCAL_RANK']) - self._world_size = int(os.environ['WORLD_SIZE']) - self._group_rank = int(os.environ['GROUP_RANK']) - self._rank_size = int(os.environ['LOCAL_WORLD_SIZE']) - self._local_group = None - self._gather_group = None + if not self._initialized: + self._initialized = True + self._rank = int(os.environ['RANK']) if str(os.environ['RANK']).isdigit() and 0 Date: Fri, 13 Sep 2024 15:26:30 +0800 Subject: [PATCH 6/7] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=B7=A5=E5=85=B7?= =?UTF-8?q?=E5=90=8D=E4=B8=8E=E6=96=87=E4=BB=B6=E5=A4=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/{ms-pre-check => precheck}/LICENSE | 0 profiler/{ms-pre-check => precheck}/README.md | 6 +++--- profiler/{ms-pre-check => precheck}/analyze/__init__.py | 0 profiler/{ms-pre-check => precheck}/analyze/analyze.py | 0 profiler/{ms-pre-check => precheck}/common/__init__.py | 0 profiler/{ms-pre-check => precheck}/common/path_manager.py | 0 profiler/{ms-pre-check => precheck}/common/utils.py | 0 profiler/{ms-pre-check => precheck}/entrance/__init__.py | 0 profiler/{ms-pre-check => precheck}/entrance/entrance.py | 0 profiler/{ms-pre-check => precheck}/manager/__init__.py | 0 .../{ms-pre-check => precheck}/manager/group_manager.py | 0 profiler/{ms-pre-check => precheck}/pre_check/__init__.py | 0 profiler/{ms-pre-check => precheck}/pre_check/check.py | 0 profiler/{ms-pre-check => precheck}/run.sh | 0 profiler/{ms-pre-check => precheck}/test_op/__init__.py | 0 profiler/{ms-pre-check => precheck}/test_op/test_op.py | 0 16 files changed, 3 insertions(+), 3 deletions(-) rename profiler/{ms-pre-check => precheck}/LICENSE (100%) rename profiler/{ms-pre-check => 
precheck}/README.md (83%) rename profiler/{ms-pre-check => precheck}/analyze/__init__.py (100%) rename profiler/{ms-pre-check => precheck}/analyze/analyze.py (100%) rename profiler/{ms-pre-check => precheck}/common/__init__.py (100%) rename profiler/{ms-pre-check => precheck}/common/path_manager.py (100%) rename profiler/{ms-pre-check => precheck}/common/utils.py (100%) rename profiler/{ms-pre-check => precheck}/entrance/__init__.py (100%) rename profiler/{ms-pre-check => precheck}/entrance/entrance.py (100%) rename profiler/{ms-pre-check => precheck}/manager/__init__.py (100%) rename profiler/{ms-pre-check => precheck}/manager/group_manager.py (100%) rename profiler/{ms-pre-check => precheck}/pre_check/__init__.py (100%) rename profiler/{ms-pre-check => precheck}/pre_check/check.py (100%) rename profiler/{ms-pre-check => precheck}/run.sh (100%) rename profiler/{ms-pre-check => precheck}/test_op/__init__.py (100%) rename profiler/{ms-pre-check => precheck}/test_op/test_op.py (100%) diff --git a/profiler/ms-pre-check/LICENSE b/profiler/precheck/LICENSE similarity index 100% rename from profiler/ms-pre-check/LICENSE rename to profiler/precheck/LICENSE diff --git a/profiler/ms-pre-check/README.md b/profiler/precheck/README.md similarity index 83% rename from profiler/ms-pre-check/README.md rename to profiler/precheck/README.md index c8c2580175..570b67ae96 100644 --- a/profiler/ms-pre-check/README.md +++ b/profiler/precheck/README.md @@ -1,7 +1,7 @@ -# MsPreCheck +# precheck #### 介绍 -MsPreCheck1.0是一个一键性能预检工具,能够快速分析集群计算与通信性能是否达到标杆值。运行完成后能够在命令行窗口与文件夹中生成对应的性能数据。 +precheck1.0是一个一键性能预检工具,能够快速分析集群计算与通信性能是否达到标杆值。运行完成后能够在命令行窗口与文件夹中生成对应的性能数据。 #### 软件架构 - analyse——analyse 主要分析模块,数据打屏模块,csv数据生成模块 @@ -17,7 +17,7 @@ MsPreCheck1.0是一个一键性能预检工具,能够快速分析集群计算 1. 确保本机已经安装了昇腾NPU卡驱动包与对应的CANN包。 2. 克隆代码仓库(请替换``为实际的仓库URL):`git clone ` 3. 新建conda环境,选择python版本为python=3.8:`conda create -n ms_pre_check python=3.8 conda activate ms_pre_check` -4. 
进入ms-pre-check主文件夹中,运行:`pip install -r requirements.txt` +4. 进入profiler/precheck主文件夹中,运行:`pip install -r requirements.txt` #### 使用说明 diff --git a/profiler/ms-pre-check/analyze/__init__.py b/profiler/precheck/analyze/__init__.py similarity index 100% rename from profiler/ms-pre-check/analyze/__init__.py rename to profiler/precheck/analyze/__init__.py diff --git a/profiler/ms-pre-check/analyze/analyze.py b/profiler/precheck/analyze/analyze.py similarity index 100% rename from profiler/ms-pre-check/analyze/analyze.py rename to profiler/precheck/analyze/analyze.py diff --git a/profiler/ms-pre-check/common/__init__.py b/profiler/precheck/common/__init__.py similarity index 100% rename from profiler/ms-pre-check/common/__init__.py rename to profiler/precheck/common/__init__.py diff --git a/profiler/ms-pre-check/common/path_manager.py b/profiler/precheck/common/path_manager.py similarity index 100% rename from profiler/ms-pre-check/common/path_manager.py rename to profiler/precheck/common/path_manager.py diff --git a/profiler/ms-pre-check/common/utils.py b/profiler/precheck/common/utils.py similarity index 100% rename from profiler/ms-pre-check/common/utils.py rename to profiler/precheck/common/utils.py diff --git a/profiler/ms-pre-check/entrance/__init__.py b/profiler/precheck/entrance/__init__.py similarity index 100% rename from profiler/ms-pre-check/entrance/__init__.py rename to profiler/precheck/entrance/__init__.py diff --git a/profiler/ms-pre-check/entrance/entrance.py b/profiler/precheck/entrance/entrance.py similarity index 100% rename from profiler/ms-pre-check/entrance/entrance.py rename to profiler/precheck/entrance/entrance.py diff --git a/profiler/ms-pre-check/manager/__init__.py b/profiler/precheck/manager/__init__.py similarity index 100% rename from profiler/ms-pre-check/manager/__init__.py rename to profiler/precheck/manager/__init__.py diff --git a/profiler/ms-pre-check/manager/group_manager.py b/profiler/precheck/manager/group_manager.py similarity 
index 100% rename from profiler/ms-pre-check/manager/group_manager.py rename to profiler/precheck/manager/group_manager.py diff --git a/profiler/ms-pre-check/pre_check/__init__.py b/profiler/precheck/pre_check/__init__.py similarity index 100% rename from profiler/ms-pre-check/pre_check/__init__.py rename to profiler/precheck/pre_check/__init__.py diff --git a/profiler/ms-pre-check/pre_check/check.py b/profiler/precheck/pre_check/check.py similarity index 100% rename from profiler/ms-pre-check/pre_check/check.py rename to profiler/precheck/pre_check/check.py diff --git a/profiler/ms-pre-check/run.sh b/profiler/precheck/run.sh similarity index 100% rename from profiler/ms-pre-check/run.sh rename to profiler/precheck/run.sh diff --git a/profiler/ms-pre-check/test_op/__init__.py b/profiler/precheck/test_op/__init__.py similarity index 100% rename from profiler/ms-pre-check/test_op/__init__.py rename to profiler/precheck/test_op/__init__.py diff --git a/profiler/ms-pre-check/test_op/test_op.py b/profiler/precheck/test_op/test_op.py similarity index 100% rename from profiler/ms-pre-check/test_op/test_op.py rename to profiler/precheck/test_op/test_op.py -- Gitee From 8cba2a1cfc7d33919419245cce50dc739677cf70 Mon Sep 17 00:00:00 2001 From: aosudh <14716987+aosudh@user.noreply.gitee.com> Date: Fri, 13 Sep 2024 18:06:24 +0800 Subject: [PATCH 7/7] =?UTF-8?q?:=E4=BF=AE=E5=A4=8D=E6=A3=80=E8=A7=86?= =?UTF-8?q?=E6=84=8F=E8=A7=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/precheck/analyze/analyze.py | 57 +++++++++++----------- profiler/precheck/entrance/entrance.py | 14 +++--- profiler/precheck/manager/group_manager.py | 17 ++----- profiler/precheck/test_op/test_op.py | 16 +++--- 4 files changed, 48 insertions(+), 56 deletions(-) diff --git a/profiler/precheck/analyze/analyze.py b/profiler/precheck/analyze/analyze.py index f5313e2468..893b56de3c 100644 --- a/profiler/precheck/analyze/analyze.py +++ 
b/profiler/precheck/analyze/analyze.py @@ -19,22 +19,9 @@ from common.utils import get_current_time_str,create_csv_writer from manager.group_manager import GroupManager from prettytable import PrettyTable class Analysis: - def analyze(self, rank: int, gather_list: list) -> bool: - ''' - Analyze 主要对外入口 - ''' - if rank != 0: - return True - csv_path=os.path.abspath(f"./analyze_{get_current_time_str()}.csv") - for node,node_tensor in enumerate(gather_list): - node_tensor_list = node_tensor.tolist() - for rank_id,tensor_data in enumerate(node_tensor_list): - complete_data = self._add_data(tensor_data) - self._print_result(complete_data) - self._write_csv(csv_path,node * GroupManager().get_rank_size() + rank_id, complete_data) - print(f'For further evaluation, please open csv file on {csv_path}') - return True - def _add_data(self, tensor_data: list) -> list: + + @staticmethod + def _add_data(tensor_data: list) -> list: ''' tensor input data to dict ''' @@ -56,6 +43,18 @@ class Analysis: result.append(converted_dict) return result + @staticmethod + def _write_csv(csv_path: str, rank: int, complete_data: list): + ''' + 创建csv文件 + ''' + if not complete_data: + return + headers = list(complete_data[0].keys()) if rank == 0 else [] + list_data = [list(data.values()) for data in complete_data] + create_csv_writer(csv_path, headers, list_data) + + @staticmethod def _print_result(self, complete_data: list): table_data={} for card_data in complete_data: @@ -74,18 +73,18 @@ class Analysis: table = PrettyTable(headers) table.add_rows(table_rows) table.border = False - print(table) - elif 1<= rank_id <= 9: - print(' ',table_rows[0][0],' ',table_rows[0][1],' ',table_rows[0][2],' ',table_rows[0][3],' ',table_rows[0][4],' ',table_rows[0][5],' ',table_rows[0][6]) - elif 10<= rank_id: - print(' ',table_rows[0][0],' ',table_rows[0][1],' ',table_rows[0][2],' ',table_rows[0][3],' ',table_rows[0][4],' ',table_rows[0][5],' ',table_rows[0][6]) - def _write_csv(self, csv_path: str, rank: int, 
complete_data: list): + + def analyze(self, rank: int, gather_list: list) -> bool: ''' - 创建csv文件 + Analyze 主要对外入口 ''' - if not complete_data: - print("ERROR no data.") - return - headers = list(complete_data[0].keys()) if rank == 0 else [] - list_data = [list(data.values()) for data in complete_data] - create_csv_writer(csv_path, headers, list_data) \ No newline at end of file + if rank != 0: + return True + csv_path=os.path.abspath(f"./analyze_{get_current_time_str()}.csv") + for node,node_tensor in enumerate(gather_list): + node_tensor_list = node_tensor.tolist() + for rank_id,tensor_data in enumerate(node_tensor_list): + complete_data = self._add_data(tensor_data) + self._print_result(complete_data) + self._write_csv(csv_path,node * GroupManager().get_rank_size() + rank_id, complete_data) + return True \ No newline at end of file diff --git a/profiler/precheck/entrance/entrance.py b/profiler/precheck/entrance/entrance.py index 9b7544a562..eff09c4f23 100644 --- a/profiler/precheck/entrance/entrance.py +++ b/profiler/precheck/entrance/entrance.py @@ -27,12 +27,14 @@ from test_op.test_op import Statistics class Entrance: statistics_list = [] - def generate_tensor(self, statistics_list: List[Statistics], local_rank) -> any: + + @staticmethod + def generate_tensor(statistics_list: List[Statistics], local_rank) -> any: return torch.Tensor([[s.rank_id, s.e2e_time, s.total_time, s.free_time, s.max_time, s.min_time, s.size, s.bandwidth,s.count ] for s in statistics_list]).npu(local_rank) - - def gather_rank_data(self,tensor,local_rank) -> list: - print(f"gather rank data, {local_rank}") + + @staticmethod + def gather_rank_data(tensor, local_rank) -> list: dist.barrier() gather_list = [] rank_size = GroupManager().get_rank_size() @@ -54,17 +56,13 @@ class Entrance: GroupManager() rank = GroupManager().get_rank() local_rank = GroupManager().get_local_rank() - print(f"start run, rank is {rank}, local_rank is {local_rank}") index=1 while True: statistics_list = ['']* 
len(Config.CHECK_LIST) if Config.CHECK_LIST is None: - print("CHECK_LIST is None") break for op_name, op_func in Config.CHECK_LIST.items(): statistics_list[op_name.op_func] = op_func(rank=local_rank).run() - if local_rank == 0: - print(str(op_name).split('.')[-1] + " end at group " + str(index-1)) tensor = self.generate_tensor(statistics_list, local_rank) gather_list = self.gather_rank_data(tensor, local_rank) if Analysis().analyze(rank=rank, gather_list=gather_list): diff --git a/profiler/precheck/manager/group_manager.py b/profiler/precheck/manager/group_manager.py index b210f9bc1b..1737789ddb 100644 --- a/profiler/precheck/manager/group_manager.py +++ b/profiler/precheck/manager/group_manager.py @@ -23,11 +23,11 @@ class GroupManager: def __init__(self:any)->None: if not self._initialized: self._initialized = True - self._rank = int(os.environ['RANK']) if str(os.environ['RANK']).isdigit() and 0 float: + @staticmethod + def calculate_tensor_size_g(tensor: any) -> float: return tensor.numel() * tensor.element_size() / Constant.BYTE_SIZE / Constant.BYTE_SIZE / Constant.BYTE_SIZE + def run(self): pass \ No newline at end of file -- Gitee