From 4d30cc891fdcd322cd980976e276d9b22040dc03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=9B=BD=E9=91=AB?= <745324460@qq.com> Date: Thu, 29 Sep 2022 02:16:48 +0000 Subject: [PATCH 01/11] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20Tools?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Tools/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 Tools/.keep diff --git a/Tools/.keep b/Tools/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From 4b2860dc06efeec365e41e1174303be8168f4c13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=9B=BD=E9=91=AB?= <745324460@qq.com> Date: Thu, 29 Sep 2022 02:19:42 +0000 Subject: [PATCH 02/11] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20ascend=5Fdistribute?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Tools/ascend_distribute/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 Tools/ascend_distribute/.keep diff --git a/Tools/ascend_distribute/.keep b/Tools/ascend_distribute/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From 46097b3f89fb13451150ca85d903cb55ca33de02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=9B=BD=E9=91=AB?= <745324460@qq.com> Date: Thu, 29 Sep 2022 02:20:55 +0000 Subject: [PATCH 03/11] add tool ascend_distribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 傅国鑫 <745324460@qq.com> --- Tools/ascend_distribute/README.md | 155 +++++++++++++ Tools/ascend_distribute/common_flags.py | 160 +++++++++++++ Tools/ascend_distribute/config.py | 150 +++++++++++++ Tools/ascend_distribute/config_args.py | 108 +++++++++ Tools/ascend_distribute/config_dist.py | 106 +++++++++ Tools/ascend_distribute/config_utils.py | 50 +++++ Tools/ascend_distribute/distribute_npu.py | 47 ++++ Tools/ascend_distribute/multi_utils.py | 248 +++++++++++++++++++++ Tools/ascend_distribute/process_manage.py | 75 +++++++ Tools/ascend_distribute/process_monitor.py | 95 ++++++++ Tools/ascend_distribute/requirements.txt | 2 + Tools/ascend_distribute/train.py | 155 +++++++++++++ 12 files changed, 1351 insertions(+) create mode 100644 Tools/ascend_distribute/README.md create mode 100644 Tools/ascend_distribute/common_flags.py create mode 100644 Tools/ascend_distribute/config.py create mode 100644 Tools/ascend_distribute/config_args.py create mode 100644 Tools/ascend_distribute/config_dist.py create mode 100644 Tools/ascend_distribute/config_utils.py create mode 100644 Tools/ascend_distribute/distribute_npu.py create mode 100644 Tools/ascend_distribute/multi_utils.py create mode 100644 Tools/ascend_distribute/process_manage.py create mode 100644 Tools/ascend_distribute/process_monitor.py create mode 100644 Tools/ascend_distribute/requirements.txt create mode 100644 Tools/ascend_distribute/train.py diff --git a/Tools/ascend_distribute/README.md b/Tools/ascend_distribute/README.md new file mode 100644 index 000000000..988cfb85c --- /dev/null +++ b/Tools/ascend_distribute/README.md @@ -0,0 +1,155 @@ +# 分布式插件使用指南 + +## 1、分布式插件简介 + +本插件主要提供了基于昇腾AI处理器快速拉起分布式训练的方法,用户只需要输入单卡训练指令即可快速进行分布式训练,同时加入了AOE分布式梯度调优功能,用户可以在完成分布式训练后直接进行调优以提升分布式训练的性能 + + + +### 1.1、使用约束 + +本插件仅适用于TensorFlow1.X/2.X框架的训练网络 + +使用前请确保网络单卡训练指令可以正常在昇腾AI处理器上进行训练 + + + +### 1.2、环境准备 + +#### 1.2.1、训练环境准备 + +硬件环境和运行环境请参见《[CANN软件安装指南](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373?category=installation-update)》 + + + +#### 1.2.2、插件依赖安装 + +运行以下命令安装分布式工具所需依赖 + +``` +pip3 install 
requirements.txt +``` + + + +#### 1.2.3、AOE工具安装 + +AOE工具的下载和安装请参见《AOE工具使用指南》,如需使用分布式插件中的AOE梯度调优功能请按照该指南中的说明进行AOE工具的下载和安装,如仅需调用知识库或不进行梯度调优则可以不安装使用AOE工具 + + + +## 2、分布式插件的使用 + +### 2.1、使用分布式插件运行单机或多机分布式训练 + +#### 2.1.1、使用流程 + +**单机多卡分布式训练** + +启动单机多卡训练时,用户通过--np参数指定总的训练卡数,用--env参数指定{服务器IP}:{device数量}:{device id}, + +**多机多卡分布式训练** + +启动多机多卡训练时,用户通过--np参数指定总的训练卡数,用--env参数指定{服务器IP}:{device数量}:{device id},多台机器间使用“,”进行分隔 + +#### 2.1.2、快速上手 + +**单机多卡分布式训练** + +使用以下单机8卡示例命令拉起单机多卡分布式训练 + +说明:--np 8设置在8卡上训练,--env {ip}:{device数量}:{device id}设置在指定IP服务器上的8张卡上进行训练,当设置8卡时可以不写:{device id},详细见《分布式插件参数说明章节》 + +``` +python3 distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_1p.sh --data_path=/npu/traindata" +``` + + + +使用以下单机4卡示例命令拉起单机非8卡分布式训练 + +说明:需要在--env 10.10.10.10后添加“:{device数量}:{device ID}”以指定使用的device数量和使用的device ID + +``` +python3 distrbute_npu.py --np 4 --env 10.10.10.10:4:0123 --train_command "bash train_1p.sh --data_path=/npu/traindata" +``` + + + +**多机多卡分布式训练** + +使用以下多机16卡示例命令拉起多机多卡分布式训练 + +说明:类似于单机多卡训练,--env 参数中不同机器用“,”分隔 + +``` +python3 distrbute_npu.py --np 8 --env 10.10.10.10:4:0123,10.10.10.11:4:0123 --train_command "bash train_1p.sh --data_path=/npu/traindata" +``` + + + +### 2.2、使用AOE工具进行分布式梯度切分调优 + +#### 2.2.1、调优流程 + +**使用AOE生成自定义知识库** + +当用户可以拉起单机多卡分布式训练后,可以开启AOE梯度调优,仅需在拉起单机分布式的命令后加一个--aoe=True的参数即可。执行该命令后,会默认在device0上拉起单个进程进行梯度调优,梯度调优结束后会生成一个{芯片名}_gradient_fusion.json的自定义知识库,例如Ascend910A_gradient_fusion.json + +说明:对于一个网络的某一个场景,AOE只用调优一次;对于已经进行过AOE梯度调优的网络,无需再次进行AOE + + + +**使用生成的知识库** + +AOE调优完毕后,会生成一个自定义知识库文件,通过环境变量调用知识库进行分布式训练 + + + +#### 2.2.2、快速上手 + +**使用AOE生成自定义知识库** + +使用以下示例中的命令进行AOE调优 + +``` +python3 distribute_npu.py --np 8 train_command "bash train_1p.sh --data_path=/npu/traindata" --aoe=True +``` + +说明:AOE调优前需确保该命令可以进行分布式训练,在可执行分布式训练的命令后添加 --aoe=True即可 + + + +**使用/禁用调优后生成的知识库** + +调优完毕后再次拉起分布式训练即可调用自定义知识库,当进行多机训练时会自动将自定义知识库传输到其他机器上 + +如果用户不想调用自定义知识库时可以按照以下示例在训练命令后添加 --use_library=False禁用知识库 + +``` +python3 distribute_npu.py --np 8 train_command "bash train_1p.sh --data_path=/npu/traindata" --use_library=False +``` + + + +## 3、常见问题处理 + +残留进程处理 + +报错 + + + +## 4、分布式插件参数说明 + +| 参数名 | 默认值 | 类型 | 参数说明 | +| --------------- | ------ | ---- | ------------------------------------------------------------ | +| -h 或 --help | 无 | 无 | 打印帮助信息,使用python3 distribute_npu.py --help打开帮助信息 | +| --env | None | 必须 | 环境信息,按照ip:device数量:device_id的格式进行输入,多机时请用','进行分隔。示例:--env 10.10.10.10:4:0123,10.10.10.11:4:1234 | +| --np | 8 | 必须 | 总共使用的device数量,默认为8卡。示例:--np 16 | +| --train_command | None | 必须 | 启动单卡训练的指令。示例:--train_command "bash train_1p.sh --data_path=/home/data" | +| --aoe | False | 可选 | 是否使用AOE工具进行分布式梯度调优,默认为False。使用 --aoe=True启动 | +| --use_library | | 可选 | 是否使用AOE调优生成的知识库,默认为True,当用户 | + + + diff --git a/Tools/ascend_distribute/common_flags.py b/Tools/ascend_distribute/common_flags.py new file mode 100644 index 000000000..1244f6578 --- /dev/null +++ b/Tools/ascend_distribute/common_flags.py @@ -0,0 +1,160 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# -*- coding: utf-8 -*- +import argparse + + +def define_ascend_distribute_flags(): + parser = argparse.ArgumentParser("""NPU Distribute run commond + + ################################################################################################################################################################## + + WARNING: Users only need to focus on '--np', '--env', '--train_command', '--aoe' and '--use_library' parameters, do not change other parameters! + WARNING: Before using this tool, please ensure you can perform one-device training with the Ascend NPU, and ensure using this tool with the same training command! + WARNING: Before using this tool, users need to define a config file. For more details, please see "README.md". + + Users can use this tool easily with the follow examples: + common command format: python3 distribute_npu.py --np (total device mun) --env (ip):(device num):(device id) --train_command "onedevice training command" + for one-worker-multi-devices training: python3 distribute_npu.py --np 4 --env 10.10.10.10:4:0123 --train_command "bash train.sh" + for multi-workers-multi-devices training: python3 distribute_npu.py --np 8 --env 10.10.10.10:4:0123,10.10.10.11:4:0123 --train_command "bash train.sh" + for using AOE tuning tool: python3 distribute_npu.py --np 4 --env 10.10.10.10:4:0123 --train_command "bash train.sh" --aoe=True + for disable the AOE tuned bank file: python3 distribute_npu.py --np 4 --env 10.10.10.10:4:0123 --train_command "bash train.sh" --use_library=False + + ATTENTION: 1. After successful one-worker-multi-devices training, users can train with multi-workers, just need to modify the '--env' parameter. + 2. When setting the '--env', please using ',' to separate different workers, and do not forget to modify the config file which includes env info. + 3. After successful one-worker-multi-devices training, users can tune with the AOE tool, just need to add '--aoe=True' after the previous command. + 4. After AOE, if a 'xx_gradient_fusion.json' file generated in '/root/ascend_tools/ascend_distribute/custom_tune_bank/' directory, AOE is successful. + 5. Using AOE tuned file is default, users can set '--use_library=False' to disable using AOE tuned file. 
+ + ################################################################################################################################################################## + """) + parser.add_argument( + "--config", + default=None, + help="Enter containing server ip:username:password.", + ) + + parser.add_argument( + "--np", + default=8, + type=int, + help="Necessary, the total number of devices used for training.", + ) + + parser.add_argument( + "--env", + default=None, + help="Necessary, environment information, please input with '--env {ip}:{device num}:{device ip}' format, when training with MultiWorker, please use ',' to separate different workers", + ) + + parser.add_argument( + "--train_command", + default=None, + type=str, + help="Necessary, training command, input like --train_command 'bash train_1p.sh' or --train_command 'python3 train.py'", + ) + + parser.add_argument( + "--aoe", + default=False, + type=bool, + help="Optional, if or not use AOE, default is False, use --aoe=True to enable", + ) + + parser.add_argument( + "--use_library", + default=False, + type=bool, + help="Optional, if or not training with custom tune bank file witch generated by AOE, default is False, use --use_library=True to enable", + ) + + parser.add_argument( + "--config_file", + default=None, + ) + + parser.add_argument( + "--train_log_dir", + default="", + type=str, + ) + + parser.add_argument( + "--device_id", + default=0, + type=int, + ) + + parser.add_argument( + "--rank_nums", + default=2, + type=int, + ) + + parser.add_argument( + "--start_rank_id", + default=0, + type=int, + ) + + parser.add_argument( + "--multi_worker", + action="store_true", + ) + + parser.add_argument( + "--rank_size", + default=8, + type=int, + ) + + parser.add_argument( + "--worker_num", + default=0, + type=int, + ) + + parser.add_argument( + "--use_config", + action="store_true", + ) + + parser.add_argument( + "--command_list", + default=None, + type=str, + ) + + parser.add_argument( + "--server_list", + type=None, + ) + + return parser \ No newline at end of file diff --git a/Tools/ascend_distribute/config.py b/Tools/ascend_distribute/config.py new file mode 100644 index 000000000..1b188fc17 --- /dev/null +++ b/Tools/ascend_distribute/config.py @@ -0,0 +1,150 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# -*- coding: utf-8 -*- +import os + +from config_dist import * +from config_utils import _default_train_pattern +from multi_utils import get_default_config, exits_default_config, get_server_info, write_to_json +from config_args import cache_dir + + +def deal_host_info(server_info, ids, host, env_device_nums): + + host_list = host.split(":") + env_device_nums += int(host_list[1]) + server_id = 'server' + str(ids) + if len(host_list) == 2 or len(host_list) == 3: + if len(host_list) == 3: + print(int(host_list[1])) + if int(host_list[1]) % 2 == 0: + device_list = list(map(int, list(host_list[-1]))) + username, password = get_server_info(host_list[0]) + if len(device_list) == int(host_list[1]): + server_info[server_id] = { + 'server_ip': host_list[0], + 'server_username': username, + 'server_password': password, + 'device_list': device_list + } + else: + raise ValueError("The information before and after the device dose not match") + + else: + if int(host_list[-1]) == 8: + + device_list = [i for i in range(8)] + username, password = get_server_info(host_list[0]) + server_info[server_id] = { + 'server_ip': host_list[0], + 'server_username': username, + 'server_password': password, + 'device_list': device_list + } + else: + raise ValueError("When the number of devices is not 8, give the specific device") + else: + raise ValueError("Incorrect information") + + return server_info, env_device_nums + + +def get_train_pattern(args): + device_nums = int(args.np) + if ',' in args.env: + server_info = {} + hosts_list = args.env.split(',') + if len(hosts_list) % 2 != 0: + raise ValueError("Multi worker must be a multiple of 2") + else: + env_device_nums = 0 + for ids, host in enumerate(hosts_list): + server_info, env_device_nums = deal_host_info(server_info, ids, host, env_device_nums) + + if device_nums == env_device_nums: + return _default_train_pattern(1), server_info + else: + server_info = {} + env_device_nums = 0 + server_info, env_device_nums = deal_host_info(server_info, 0, args.env, env_device_nums) + print(device_nums == env_device_nums) + if device_nums == env_device_nums: + return _default_train_pattern(0), server_info + else: + raise ValueError("device nums != len(device_list)") + + +def analyze_user_input(args): + """ + + :param args: + "return" + """ + train_pattern, env_info = get_train_pattern(args) + if train_pattern == "OneWorker OneDevice": + make_onedevice_config(args, env_info) + elif train_pattern == "OneWorker MultiDevice": + make_multidevice_config(args, env_info) + elif train_pattern == "MultiWorker MultiDevice": + make_multiworker_multidevice_config(args, env_info) + elif train_pattern == "Host2Host": + make_h2h_config(args, env_info) + else: + pass + + +def save_server_info(args): + config = args.config + if ',' in config: + server_list = config.split(',') + else: + server_list = [config] + + server_dict = dict() + print(server_dict) + for server in server_list: + server_info = server.split(':') + if server_info[0] not in server_dict: + server_dict[server_info[0]] = {} + server_dict[server_info[0]]["server_username"] = server_info[1] + server_dict[server_info[0]]["server_password"] = server_info[2] + file_name = 'env_config.json' + write_to_json(file_name, server_dict) + + +def put_process_monitor(): + filepath = os.path.dirname(__file__) + filename = filepath + '/process_monitor.py' + os.system(f"cp {filename} {cache_dir}") + + +def config_command(args): + 
put_process_monitor() + analyze_user_input(args) \ No newline at end of file diff --git a/Tools/ascend_distribute/config_args.py b/Tools/ascend_distribute/config_args.py new file mode 100644 index 000000000..b104e86e1 --- /dev/null +++ b/Tools/ascend_distribute/config_args.py @@ -0,0 +1,108 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os +import yaml + +from enum import Enum +from dataclasses import dataclass + + +cache_home = os.path.expanduser("~/ascend_tools") +cache_dir = os.path.join(cache_home, "ascend_distribute/") +library_path = os.path.join(cache_dir, "custom_tune_bank") +if not os.path.exists(library_path): + os.makedirs(library_path) +listnames = os.listdir(library_path) +bank_path=None +if listnames: + for names in listnames: + if "gradient_fusion" in names: + bank_path = os.path.join(library_path, names) +default_config_file = os.path.join(cache_dir, "ascend_distribute_config.yaml") +default_rank_table_file = os.path.join(cache_dir, "rank_table.json") + + +def load_config_from_file(config_file): + config_file_exists = config_file is not None and os.path.isfile(config_file) + config_file = config_file if config_file_exists else default_config_file + + with open(config_file, "r", encoding="utf-8") as f: + config_class = MultiWorkerConfig + return config_class.from_yaml_file(yaml_file=config_file) + + +@dataclass +class BaseConfig: + device_id: int + + def to_dict(self): + result = self.__dict__ + + for key, value in result.items(): + if isinstance(value, Enum): + result[key] = value.value + return result + + def to_yaml_file(self, yaml_file): + with open(yaml_file, "w", encoding="utf-8") as f: + yaml.safe_dump(self.to_dict(), f) + + @classmethod + def from_yaml_file(cls, yaml_file=None): + yaml_file = default_config_file if yaml_file is None else yaml_file + with open(yaml_file, "r", encoding="utf-8") as f: + config_dict = yaml.safe_load(f) + print(config_dict) + if "start_rank_id" in config_dict: + config_dict['multi_worker'] = True + return cls(**config_dict) + + def __post_init__(self): + pass + + +@dataclass +class MultiDeviceConfig(BaseConfig): + rank_nums: int + device_list: list + rank_size: int = 8 + + def __post_init__(self): + return super().__post_init__() + + +@dataclass +class MultiWorkerConfig(BaseConfig): + 
start_rank_id: int = 0 + multi_worker: bool = False + worker_num: int = 1 + + def __post_init__(self): + return super().__post_init__() \ No newline at end of file diff --git a/Tools/ascend_distribute/config_dist.py b/Tools/ascend_distribute/config_dist.py new file mode 100644 index 000000000..8ca4ad732 --- /dev/null +++ b/Tools/ascend_distribute/config_dist.py @@ -0,0 +1,106 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# -*- coding: utf-8 -*- +import os +import re +import json + +from config_utils import _default_device_ip, _default_server_id +from config_args import BaseConfig, MultiDeviceConfig, MultiWorkerConfig +from multi_utils import get_worker_device_ip, create_rank_table, check_multi_communication, save_rank_table +from multi_utils import save_default_config + + +def make_onedevice_config(args, env_info): + pass + + +def make_multidevice_config(args, env_dict): + if isinstance(env_dict, dict): + pass + else: + raise ValueError("") + device_list = env_dict['server0']['device_list'] + device_ip = _default_device_ip() + server_id = _default_server_id() + + server_info = { + 'device_ip': device_ip, + 'server_id': server_id, + 'device_list': device_list + } + + create_rank_table(server_info) + server_list = [] + env_dict['server0']['device_ip'] = None + env_dict['server0']['rank_id'] = None + args.rank_size = len(device_list) + server_list.append(env_dict['server0']) + save_default_config(len(device_list), server_list) + args.server_list = server_list + args.worker_num = 1 + + +def make_multiworker_multidevice_config(args, env_dict): + if isinstance(env_dict, dict): + pass + else: + pass + + worker_num = len(env_dict) + server_list = [] + device_ids = [] + for i in range(0, worker_num): + server_id = 'server' + str(i) + device_ids.append(len(env_dict[server_id]['device_list'])) + server_list.append(env_dict[server_id]) + + rank_id_list = [0] + for device_id in device_ids[:-1]: + rank_id_list.append(rank_id_list[-1] + device_id) + rank_size = sum(device_ids) + + for i in range(worker_num): + device_ip = get_worker_device_ip(server_list[i]) + server_list[i]['device_ip'] = device_ip + server_list[i]['rank_id'] = rank_id_list[i] + + check_multi_communication(server_list) + save_default_config(rank_size, server_list) + save_rank_table(rank_size, 
server_list) + + args.rank_size = rank_size + args.multi_worker = True + args.worker_num = worker_num + args.server_list = server_list + + +def make_h2h_config(args, env_info): + pass \ No newline at end of file diff --git a/Tools/ascend_distribute/config_utils.py b/Tools/ascend_distribute/config_utils.py new file mode 100644 index 000000000..c8aa8e8c3 --- /dev/null +++ b/Tools/ascend_distribute/config_utils.py @@ -0,0 +1,50 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +def _default_device_ip(): + default_device_ip = [ + '192.168.100.100', + '192.168.101.100', + '192.168.102.100', + '192.168.103.100', + '192.168.100.101', + '192.168.101.101', + '192.168.102.101', + '192.168.103.101', + ] + return default_device_ip + + +def _default_server_id(): + default_server_ip = "10.147.179.27" + return default_server_ip + + +def _default_train_pattern(index): + return ["OneWorker MultiDevice", "MultiWorker MultiDevice"][index] \ No newline at end of file diff --git a/Tools/ascend_distribute/distribute_npu.py b/Tools/ascend_distribute/distribute_npu.py new file mode 100644 index 000000000..bcebe77b0 --- /dev/null +++ b/Tools/ascend_distribute/distribute_npu.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from common_flags import define_ascend_distribute_flags +from config import config_command, save_server_info +from train import run_command + + +def main(): + parser = define_ascend_distribute_flags() + args, unknow_args = parser.parse_known_args() + if args.config is not None: + save_server_info(args) + else: + config_command(args) + run_command(args) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/Tools/ascend_distribute/multi_utils.py b/Tools/ascend_distribute/multi_utils.py new file mode 100644 index 000000000..94a84cca9 --- /dev/null +++ b/Tools/ascend_distribute/multi_utils.py @@ -0,0 +1,248 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import os +import time +import json + +import paramiko + +from config_args import cache_dir, default_rank_table_file, default_config_file, library_path, bank_path + + +def get_default_hccn_conf(): + hccn_conf = os.path.join("/etc", "hccn.conf") + + if os.path.isfile(hccn_conf) and os.path.getsize(hccn_conf) != 0: + default_hccn_conf = hccn_conf + else: + default_hccn_conf = None + return default_hccn_conf + + +def ssh_server(server_info): + ssh = paramiko.SSHClient() + ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + + try: + ssh.connect(hostname=server_info['server_ip'], port=22, + username=server_info['server_username'], password=server_info['server_password']) + return ssh + except Exception as e: + print(e) + return + + +def get_worker_device_ip(server_info): + device_ip = [] + ssh = ssh_server(server_info) + default_hccn_conf = get_default_hccn_conf() + + if default_hccn_conf: + ssh_in, ssh_out, ssh_error = ssh.exec_command(f"cat {default_hccn_conf}") + result = ssh_out.read() or ssh_error.read() + result = result.decode().strip().split('\n') + for i in range(8): + address = 'address_' + str(i) + for info in result: + if address in info: + device_ip.append(info.split("=")[-1]) + + ssh.exec_command(f"mkdir -p {cache_dir}") + ssh.exec_command(f"mkdir -p {library_path}") + ssh.close() + return device_ip + else: + print("Not found /etc/hccn.conf, please prepare /etc/hccn.conf.") + + +def write_to_json(file_name, dict): + if not os.path.isdir(cache_dir): + os.makedirs(cache_dir) + json_dict = json.dumps(dict) + json_file = os.path.join(cache_dir, file_name) + f = open(json_file, 'w') + f.write(json_dict) + f.close() + return json_file + + +def table_dict(server_count, server_list_json): + return { + "server_count": str(server_count), + "server_list": server_list_json, + "status": "completed", + "version": "1.0" + } + + +def create_rank_table(server_info): + rank_id_start = 0 + server_count = 1 + rank_size = len(server_info['device_list']) + device = [] + server_list_json = [] + + for dev in server_info['device_list']: + rank_id = rank_id_start + rank_id_start += 1 + device.append({"device_id": str(dev), "device_ip": str(server_info['device_ip'][dev]), "rank_id": str(rank_id)}) + server_list_json.append({"server_id": str(server_info['server_id']), "device": device}) + + rank_table_dict = table_dict(server_count, server_list_json) + file_name = 'rank_table_' + str(server_count) + '_' + str(rank_size) + 'p.json' + write_to_json(file_name, rank_table_dict) + + +def check_sub_server(server): + server_0 = server[0] + server_1 = server[1] + + def check_each(host_server, target_server): + ssh_host = ssh_server(host_server) + for i in range(len(target_server['device_ip'])): + ssh_host.exec_command(f"cat /e hccn_tool -i {i} -netdetect -s address {target_server['device_ip'][i]}") + + time.sleep(5) + ssh_in, ssh_out, ssh_error = ssh_host.exec_command(f"hccn_tool -i {i} -net_health -g") + result = ssh_out.read() or ssh_error.read() + result = result.decode().strip() + if "Success" in result: + print(f"{host_server['server_ip']} device {i} ->" + f"{target_server['server_ip']} device {i} Success!!!") + ssh_host.close() + + # A -> B + check_each(server_0, server_1) + # B -> A + check_each(server_1, server_0) + + +def check_multi_communication(server_list): + double_server = [(server_list[i], server_list[j]) + for i in range(len(server_list)) + for j in range(len(server_list)) if i < j] + + for server in double_server: + check_sub_server(server) + + +def sftp_server(server_info): + transport 
= paramiko.Transport((server_info['server_ip'], 22)) + try: + transport.connect(username=server_info['server_username'], password=server_info['server_password']) + except Exception as e: + print(e) + return + + sftp = paramiko.SFTPClient.from_transport(transport) + return transport, sftp + + +def put_rank_table(sftp, rank_table_file): + sftp.put(rank_table_file, rank_table_file) + + +def upload_table_file(server_list, rank_table_file): + for server in server_list[1:]: + transport, sftp = sftp_server(server) + put_rank_table(sftp, rank_table_file) + transport.close() + + +def get_server_rank(server): + server_id = server["server_ip"] + rank_id_start = server['rank_id'] + device = [] + for dev in server['device_list']: + rank_id = rank_id_start + rank_id_start += 1 + device.append({"device_id": str(dev), "device_ip": str(server['device_ip'][dev]), "rank_id": str(rank_id)}) + + return {"server_id": str(server_id), "device": device} + + +def save_rank_table(rank_size, server_list): + server_count = len(server_list) + + server_list_json = [] + for server in server_list: + server_list_json.append(get_server_rank(server)) + + rank_table_dict = table_dict(server_count, server_list_json) + file_name = 'rank_table_' + str(server_count) + '_' + str(rank_size) + 'p.json' + rank_table_file = write_to_json(file_name, rank_table_dict) + process_file = cache_dir + 'process_monitor.py' + upload_table_file(server_list, rank_table_file) + upload_table_file(server_list, process_file) + if bank_path is not None and os.path.exists(bank_path): + upload_table_file(server_list, bank_path) + + +def save_default_config(rank_size, server_list): + server_count = len(server_list) + server_dict = {'server_info': server_list} + file_name = 'default_config_' + str(server_count) + '_' + str(rank_size) + 'p.json' + write_to_json(file_name, server_dict) + + +def get_default_config(config_file, multi_worker=True): + with open(config_file, "r") as config: + load_dict = json.load(config) + if multi_worker: + server_info = {} + server_list = load_dict['server_info'] + for ids, server in enumerate(server_list): + server_id = 'server' + str(ids) + server_info[server_id] = { + 'server_ip': server['server_ip'], + 'server_username': server['server_username'], + 'server_password': server['server_password'], + 'device_list': server['device_list'] + } + else: + server_info = load_dict['server_info'][0]['device_list'] + return server_info + + +def exits_default_config(config_file): + return os.path.exists(config_file) + + +def get_server_info(host_ip): + config_file = cache_dir + "/env_config.json" + if exits_default_config(config_file): + with open(config_file, "r") as config: + try: + load_dict = json.load(config) + return load_dict[host_ip]['server_username'], load_dict[host_ip]['server_password'] + except Exception as e: + print(e) + return \ No newline at end of file diff --git a/Tools/ascend_distribute/process_manage.py b/Tools/ascend_distribute/process_manage.py new file mode 100644 index 000000000..bf5da3f39 --- /dev/null +++ b/Tools/ascend_distribute/process_manage.py @@ -0,0 +1,75 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os +import sys +import time + +from config_args import cache_dir +from multi_utils import ssh_server + + +class ProcessManage: + def __init__(self, args): + self.args = args + + def sub_process_monitor(self): + for i in range(1, self.args.worker_num): + worker_info = self.args.server_list[i] + ssh = ssh_server(worker_info) + ssh_in, ssh_out, ssh_error = ssh.exec_command("pgrep -f RANK_TABLE_FILE") + result = ssh_out.read().decode() or ssh_error.read().decode() + process = result.splitlines() + if len(process) == len(worker_info['device_list']): + print(f"{worker_info['server_ip']} has running {len(process)} Process!") + + def host_process_monitor(self): + result = os.popen("pgrep -f distribute_npu") + process = result.read().splitlines() + if len(process)-2 == self.args.rank_size: + print(f"Have Run {len(process)-2} processes!") + else: + print(f"Only found {len(process)-2} processes, not equal rank_size!") + + i = 0 + while len(process) - 2: + i += 1 + time.sleep(30) + result = os.popen("pgrep -f distribute_npu") + process = result.read().splitlines() + if i % 20 == 0: + self.sub_process_monitor() + + def after_treatment(self, signal, frame): + for i in range(1, self.args.worker_num): + worker_info = self.args.server_list[i] + ssh = ssh_server(worker_info) + ssh_in, ssh_out, ssh_error = ssh.exec_command(f"python3 {cache_dir}/process_monitor.py") + result = ssh_out.read().decode() or ssh_error.read().decode() + sys.exit(0) \ No newline at end of file diff --git a/Tools/ascend_distribute/process_monitor.py b/Tools/ascend_distribute/process_monitor.py new file mode 100644 index 000000000..8966a8124 --- /dev/null +++ b/Tools/ascend_distribute/process_monitor.py @@ -0,0 +1,95 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os +import signal + + +def get_parent_process(): + result = os.popen("pgrep -f RANK_TABLE_FILE") + result = result.read() + parent_process = [] + for line in result.splitlines(): + parent_process.append([int(line)]) + return parent_process + + +def get_all_process(): + all_process = os.popen("ps -ef") + all_process = all_process.read() + sub_parent = [] + for process in all_process.splitlines(): + process_info = list(filter(lambda x:x, process.split(' '))) + + try: + sub_parent.append([int(process_info[1]), int(process_info[2])]) + except ValueError as e: + continue + + sub_parent_dict = {} + for p in sub_parent: + if p[1] in sub_parent_dict: + sub_parent_dict[p[1]].append(p[0]) + else: + sub_parent_dict[p[1]] = [p[0]] + + return sub_parent_dict + + +def find_sub_process(process, sub_process, sub_parent_dict): + temp = [] + for p_pid in process: + sub_process.add(p_pid) + if p_pid in sub_parent_dict: + for s_pid in sub_parent_dict[p_pid]: + sub_process.add(s_pid) + temp.append(s_pid) + else: + return + find_sub_process(temp, sub_process, sub_parent_dict) + + +def find_all_process(parent_process, sub_parent_dict): + all_sub_parent = [] + for process in parent_process: + sub_process = set() + find_sub_process(process, sub_process, sub_parent_dict) + all_sub_parent.append(sub_process) + + return all_sub_parent + + +def main(): + parent_process = get_parent_process() + sub_parent_dict = get_all_process() + all_sub_parent = find_all_process(parent_process, sub_parent_dict) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/Tools/ascend_distribute/requirements.txt b/Tools/ascend_distribute/requirements.txt new file mode 100644 index 000000000..420f45a32 --- /dev/null +++ b/Tools/ascend_distribute/requirements.txt @@ -0,0 +1,2 @@ +yaml +paramiko \ No newline at end of file diff --git a/Tools/ascend_distribute/train.py b/Tools/ascend_distribute/train.py new file mode 100644 index 000000000..f9665a190 --- /dev/null +++ b/Tools/ascend_distribute/train.py @@ -0,0 +1,155 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os +import argparse +import sys +import signal +from multiprocessing import Process + +from config_args import cache_dir, default_config_file, load_config_from_file, library_path, bank_path +from multi_utils import ssh_server +from process_manage import ProcessManage + + +def create_train_command(args, ids, device_id, i=0, ssh=None, exec_path=None): + if not ssh: + os.environ['RANK_ID'] = f'{ids + args.start_rank_id}' + os.environ['ASCEND_DEVICE_ID'] = f'{device_id}' + + return args.train_command + + +def run_train_command(args, ids, device, i=0, ssh=None, exec_path=None): + train_command = create_train_command(args, ids, device, i=i, ssh=ssh) + print(os.getpid(), os.getppid()) + os.system(train_command) + + +def run_multi_command(args, ids, device, worker_info, i=0, exec_path=None): + ssh = ssh_server(worker_info) + train_command = create_train_command(args, ids, device, i=i, ssh=ssh, exec_path=exec_path) + rank_table_file = 'rank_table_' + str(args.worker_num) + '_' + str(args.rank_size) + 'p.json' + source_env1 = 'source /usr/local/Ascend/bin/setenv.bash' + source_env2 = 'source /usr/local/Ascend/latest/bin/setenv.bash' + source_env3 = 'source ~/.bashrc' + RANK_TABLE_FILE = cache_dir + rank_table_file + RANK_ID = ids + args.server_list[i]['rank_id'] + rank_table_command = f"export RANK_TABLE_FILE={RANK_TABLE_FILE}" + rank_id_command = f"export RANK_ID={RANK_ID}" + rank_size_command = f"export RANK_SIZE={args.rank_size}" + device_id_command = f"export ASCEND_DEVICE_ID={device}" + get_path = f"cd {exec_path}" + if args.use_library: + library_command = f"export TUNE_BANK_PATH={library_path}" + ssh_in, ssh_out, ssh_error = ssh.exec_command(f"{rank_table_command};{rank_id_command};{device_id_command};{library_command};" + f"{rank_size_command};{get_path};{source_env1};{source_env2};{source_env3};{train_command}") + else: + ssh_in, ssh_out, ssh_error = ssh.exec_command(f"{rank_table_command};{rank_id_command};{device_id_command};" + f"{rank_size_command};{get_path};{source_env1};{source_env2};{source_env3};{train_command}") + result = ssh_out.read().decode() or ssh_error.read().decode() + print("result=", result) + ssh.close() + + +def npu_distribute_run(args, process_list): + if args.rank_nums != len(args.server_list[0]['device_list']): + print("rank_nums != len(device_list), use len(device_list)!") + os.environ['RANK_SIZE'] = f'{args.rank_size}' + rank_table_file = 'rank_table_' + str(args.worker_num) + '_' + str(args.rank_size) + 'p.json' + os.environ['RANK_TABLE_FILE'] = cache_dir + rank_table_file + + if args.aoe: + os.environ['AOE_MODE'] = '4' + if not os.path.exists(library_path): + os.mkdir(library_path) + os.environ['TUNE_BANK_PATH'] = library_path + p = Process(target=run_train_command, args=(args, 0, 0)) + p.start() + p.join() + + else: + if args.use_library: + os.environ['TUNE_BANK_PATH'] = 
library_path + for ids, device in enumerate(args.server_list[0]['device_list']): + p = Process(target=run_train_command, args=(args, ids, device)) + p.start() + process_list.append(p) + + +def aoe_check(): + if not os.popen('lspci').readlines(): + raise ValueError("no lspci command") + + if not os.popen('aoe').readlines(): + raise ValueError("no aoe command") + + +def npu_multi_worker_run(i, args, exec_path, process_list): + worker_info = args.server_list[i] + for ids, device in enumerate(worker_info['device_list']): + p = Process(target=run_multi_command, args=(args, ids, device, worker_info, i, exec_path)) + p.start() + process_list.append(p) + + +def run_command(args): + pm = ProcessManage(args) + signal.signal(signal.SIGINT, pm.after_treatment) + if not args.train_command: + raise ValueError("'--train_command' is must") + + if args.aoe is True and args.use_library is True: + raise ValueError("cannot apply '--aoe' and '--use_library' at the same time!") + + if args.aoe: + aoe_check() + + if args.use_library and not os.path.exists(bank_path): + raise ValueError("no custom tune bank file, please use '--aoe=True' to generate custom tune bank") + + if args.worker_num > 1: + exec_path = os.getcwd() + process_list = [] + for i in range(args.worker_num): + if i == 0: + npu_distribute_run(args, process_list) + else: + npu_multi_worker_run(i, args, exec_path, process_list) + + monitor_process = Process(target=pm.host_process_monitor) + monitor_process.start() + process_list.append(monitor_process) + for p in process_list: + p.join() + else: + process_list=[] + npu_distribute_run(args, process_list) + for p in process_list: + p.join() \ No newline at end of file -- Gitee From 60bdc2e9231c01b4bc7b549544c0dc8ba8709b7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=9B=BD=E9=91=AB?= <745324460@qq.com> Date: Thu, 29 Sep 2022 02:21:08 +0000 Subject: [PATCH 04/11] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20To?= =?UTF-8?q?ols/ascend=5Fdistribute/.keep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Tools/ascend_distribute/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 Tools/ascend_distribute/.keep diff --git a/Tools/ascend_distribute/.keep b/Tools/ascend_distribute/.keep deleted file mode 100644 index e69de29bb..000000000 -- Gitee From f0edb7a87a7413e54efc42fc8c41d36c34bc495f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=9B=BD=E9=91=AB?= <745324460@qq.com> Date: Thu, 29 Sep 2022 08:57:25 +0000 Subject: [PATCH 05/11] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20To?= =?UTF-8?q?ols/.keep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Tools/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 Tools/.keep diff --git a/Tools/.keep b/Tools/.keep deleted file mode 100644 index e69de29bb..000000000 -- Gitee From b4cff5df442383860a7c07adc2379a7cb73f3645 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=9B=BD=E9=91=AB?= <745324460@qq.com> Date: Thu, 29 Sep 2022 08:57:36 +0000 Subject: [PATCH 06/11] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20To?= =?UTF-8?q?ols/ascend=5Fdistribute/README.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Tools/ascend_distribute/README.md | 155 ------------------------------ 1 file changed, 155 deletions(-) delete mode 100644 Tools/ascend_distribute/README.md diff --git a/Tools/ascend_distribute/README.md b/Tools/ascend_distribute/README.md deleted file 
mode 100644 index 988cfb85c..000000000 --- a/Tools/ascend_distribute/README.md +++ /dev/null @@ -1,155 +0,0 @@ -# 分布式插件使用指南 - -## 1、分布式插件简介 - -本插件主要提供了基于昇腾AI处理器快速拉起分布式训练的方法,用户只需要输入单卡训练指令即可快速进行分布式训练,同时加入了AOE分布式梯度调优功能,用户可以在完成分布式训练后直接进行调优以提升分布式训练的性能 - - - -### 1.1、使用约束 - -本插件仅适用于TensorFlow1.X/2.X框架的训练网络 - -使用前请确保网络单卡训练指令可以正常在昇腾AI处理器上进行训练 - - - -### 1.2、环境准备 - -#### 1.2.1、训练环境准备 - -硬件环境和运行环境请参见《[CANN软件安装指南](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373?category=installation-update)》 - - - -#### 1.2.2、插件依赖安装 - -运行以下命令安装分布式工具所需依赖 - -``` -pip3 install requirements.txt -``` - - - -#### 1.2.3、AOE工具安装 - -AOE工具的下载和安装请参见《AOE工具使用指南》,如需使用分布式插件中的AOE梯度调优功能请按照该指南中的说明进行AOE工具的下载和安装,如仅需调用知识库或不进行梯度调优则可以不安装使用AOE工具 - - - -## 2、分布式插件的使用 - -### 2.1、使用分布式插件运行单机或多机分布式训练 - -#### 2.1.1、使用流程 - -**单机多卡分布式训练** - -启动单机多卡训练时,用户通过--np参数指定总的训练卡数,用--env参数指定{服务器IP}:{device数量}:{device id}, - -**多机多卡分布式训练** - -启动多机多卡训练时,用户通过--np参数指定总的训练卡数,用--env参数指定{服务器IP}:{device数量}:{device id},多台机器间使用“,”进行分隔 - -#### 2.1.2、快速上手 - -**单机多卡分布式训练** - -使用以下单机8卡示例命令拉起单机多卡分布式训练 - -说明:--np 8设置在8卡上训练,--env {ip}:{device数量}:{device id}设置在指定IP服务器上的8张卡上进行训练,当设置8卡时可以不写:{device id},详细见《分布式插件参数说明章节》 - -``` -python3 distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_1p.sh --data_path=/npu/traindata" -``` - - - -使用以下单机4卡示例命令拉起单机非8卡分布式训练 - -说明:需要在--env 10.10.10.10后添加“:{device数量}:{device ID}”以指定使用的device数量和使用的device ID - -``` -python3 distrbute_npu.py --np 4 --env 10.10.10.10:4:0123 --train_command "bash train_1p.sh --data_path=/npu/traindata" -``` - - - -**多机多卡分布式训练** - -使用以下多机16卡示例命令拉起多机多卡分布式训练 - -说明:类似于单机多卡训练,--env 参数中不同机器用“,”分隔 - -``` -python3 distrbute_npu.py --np 8 --env 10.10.10.10:4:0123,10.10.10.11:4:0123 --train_command "bash train_1p.sh --data_path=/npu/traindata" -``` - - - -### 2.2、使用AOE工具进行分布式梯度切分调优 - -#### 2.2.1、调优流程 - -**使用AOE生成自定义知识库** - -当用户可以拉起单机多卡分布式训练后,可以开启AOE梯度调优,仅需在拉起单机分布式的命令后加一个--aoe=True的参数即可。执行该命令后,会默认在device0上拉起单个进程进行梯度调优,梯度调优结束后会生成一个{芯片名}_gradient_fusion.json的自定义知识库,例如Ascend910A_gradient_fusion.json - -说明:对于一个网络的某一个场景,AOE只用调优一次;对于已经进行过AOE梯度调优的网络,无需再次进行AOE - - - -**使用生成的知识库** - -AOE调优完毕后,会生成一个自定义知识库文件,通过环境变量调用知识库进行分布式训练 - - - -#### 2.2.2、快速上手 - -**使用AOE生成自定义知识库** - -使用以下示例中的命令进行AOE调优 - -``` -python3 distribute_npu.py --np 8 train_command "bash train_1p.sh --data_path=/npu/traindata" --aoe=True -``` - -说明:AOE调优前需确保该命令可以进行分布式训练,在可执行分布式训练的命令后添加 --aoe=True即可 - - - -**使用/禁用调优后生成的知识库** - -调优完毕后再次拉起分布式训练即可调用自定义知识库,当进行多机训练时会自动将自定义知识库传输到其他机器上 - -如果用户不想调用自定义知识库时可以按照以下示例在训练命令后添加 --use_library=False禁用知识库 - -``` -python3 distribute_npu.py --np 8 train_command "bash train_1p.sh --data_path=/npu/traindata" --use_library=False -``` - - - -## 3、常见问题处理 - -残留进程处理 - -报错 - - - -## 4、分布式插件参数说明 - -| 参数名 | 默认值 | 类型 | 参数说明 | -| --------------- | ------ | ---- | ------------------------------------------------------------ | -| -h 或 --help | 无 | 无 | 打印帮助信息,使用python3 distribute_npu.py --help打开帮助信息 | -| --env | None | 必须 | 环境信息,按照ip:device数量:device_id的格式进行输入,多机时请用','进行分隔。示例:--env 10.10.10.10:4:0123,10.10.10.11:4:1234 | -| --np | 8 | 必须 | 总共使用的device数量,默认为8卡。示例:--np 16 | -| --train_command | None | 必须 | 启动单卡训练的指令。示例:--train_command "bash train_1p.sh --data_path=/home/data" | -| --aoe | False | 可选 | 是否使用AOE工具进行分布式梯度调优,默认为False。使用 --aoe=True启动 | -| --use_library | | 可选 | 是否使用AOE调优生成的知识库,默认为True,当用户 | - - - -- Gitee From 28097b1afbf6904418a8fe03546f923afabebb9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=9B=BD=E9=91=AB?= <745324460@qq.com> Date: Thu, 29 Sep 2022 08:58:03 +0000 Subject: [PATCH 07/11] add 
readme MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 傅国鑫 <745324460@qq.com> --- Tools/ascend_distribute/README.md | 168 ++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 Tools/ascend_distribute/README.md diff --git a/Tools/ascend_distribute/README.md b/Tools/ascend_distribute/README.md new file mode 100644 index 000000000..a508d7157 --- /dev/null +++ b/Tools/ascend_distribute/README.md @@ -0,0 +1,168 @@ +# 分布式插件使用指南 + +## 1 分布式插件简介 + +工具主要用于基于昇腾AI处理器快速拉起分布式训练,简化了分布式参数配置,用户只需要输入单卡训练指令即可快速进行分布式训练,同时加入了AOE分布式梯度调优功能,用户可以在完成分布式训练后直接进行调优以提升分布式训练的性能 + +### 1.1 工具获取 + +1. 下载压缩包的方式获取 将https://gitee.com/ascend/ModelZoo-TensorFlow 以压缩包形式下载 +2. 使用git命令方式获取 +3. 移动 Tools/ascend_distribute 目录至常用公共路径 + +### 1.2 使用约束 + +- 本插件仅适用于TensorFlow1.X/2.X框架的训练网络 + +- 使用前请确保网络单卡训练指令可以正常在昇腾AI处理器上进行训练,且分布式训练代码已修改 +- 执行脚本和训练代码中不要设置任何分布式环境变量,包括但不限于:ASCEND_DEVICE_ID,RANK_TABLE_FILE,RANK_ID,RANK_SIZE... +- 多机训练时,请保证每个服务器的训练代码路径、数据集路径、脚本参数一致 + +### 1.3 已完成功能 + +- 自动根据传入的分布式参数生成对应的RANK_TABLE_FILE +- 自动设置执行分布式需要设置的环境变量 +- 对拉起的进程每10分钟检测一次 +- 多服务器训练时只需在一个服务器下发任务 + + + +### 1.2 环境准备 + +#### 1.2.1 训练环境准备 + +硬件环境和运行环境请参见《[CANN软件安装指南](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373?category=installation-update)》 + + + +#### 1.2.2 插件依赖安装 + +运行以下命令安装分布式工具所需依赖 + +``` +pip3 install requirements.txt +``` + + + +#### 1.2.3 AOE工具安装 + +AOE工具的下载和安装请参见《AOE工具使用指南》,如需使用分布式插件中的AOE梯度调优功能请按照该指南中的说明进行AOE工具的下载和安装,如仅需调用知识库或不进行梯度调优则可以不安装使用AOE工具 + + + +## 2 分布式插件的使用 + +### 2.1 使用分布式插件运行单机或多机分布式训练 + +#### 2.1.1 使用流程 + +**环境初始化** + +为了实现多机可以跨服务器拉起,执行训练之前,请完成环境配置,相关文件保存在```~/ascend_tools/ascend_distribute``` 目录下,已配置无需重复配置,华为不会记录你的任何环境信息 + +``` +# config传入参数格式 {ip}:{username}:{password} 多个环境之间用','隔开 + +python3 distrbute_npu.py --config 10.10.10.10:root:huawei,10.10.10.11:root:huawei +``` + + + +**单机多卡分布式训练** + + + +说明:```--np n``` 设置在n卡上训练,```--env {ip}:{device数量}:{device id}`` 设置在指定IP服务器上的8张卡上进行训练,当设置8卡时可以不写:{device id},详细见《分布式插件参数说明章节》 + +``` +# 单机8卡 +python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_1p.sh --data_path=/npu/traindata" + +# 单机4卡 +python3 $path/distrbute_npu.py --np 4 --env 10.10.10.10:4:0123 --train_command "bash train_1p.sh --data_path=/npu/traindata" +``` + + + +**多机多卡分布式训练** + +使用以下多机16卡示例命令拉起多机多卡分布式训练 + +说明:类似于单机多卡训练,--env 参数中不同机器用“,”分隔 + +``` +# 两机16卡 +python3 distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_1p.sh --data_path=/npu/traindata" + +# 两机8卡,每个服务器分别在device 0123上执行训练 +python3 distrbute_npu.py --np 8 --env 10.10.10.10:4:0123,10.10.10.11:4:0123 --train_command "bash train_1p.sh --data_path=/npu/traindata" +``` + + + +### 2.2 使用AOE工具进行分布式梯度切分调优 + +#### 2.2.1 调优流程 + +**使用AOE生成自定义知识库** + +当用户可以拉起单机多卡分布式训练后,可以开启AOE梯度调优,仅需在拉起单机分布式的命令后加一个--aoe=True的参数即可。执行该命令后,会默认在device0上拉起单个进程进行梯度调优,梯度调优结束后会生成一个{芯片名}_gradient_fusion.json的自定义知识库,例如Ascend910A_gradient_fusion.json + +说明:对于一个网络的某一个场景,AOE只用调优一次;对于已经进行过AOE梯度调优的网络,无需再次进行AOE + + + +**使用生成的知识库** + +AOE调优完毕后,会生成一个自定义知识库文件,通过环境变量调用知识库进行分布式训练 + + + +#### 2.2.2 快速上手 + +**使用AOE生成自定义知识库** + +使用以下示例中的命令进行AOE调优 + +``` +python3 distribute_npu.py --np 8 train_command "bash train_1p.sh --data_path=/npu/traindata" --aoe=True +``` + +说明:AOE调优前需确保该命令可以进行分布式训练,在可执行分布式训练的命令后添加 --aoe=True即可 + + + +**使用/禁用调优后生成的知识库** + +调优完毕后再次拉起分布式训练即可调用自定义知识库,当进行多机训练时会自动将自定义知识库传输到其他机器上 + +如果用户不想调用自定义知识库时可以按照以下示例在训练命令后添加 --use_library=False禁用知识库 + +``` +python3 distribute_npu.py --np 8 train_command "bash train_1p.sh 
--data_path=/npu/traindata" --use_library=False
+```
+
+
+
+## 3 常见问题处理
+
+**残留进程处理**:若训练异常退出导致服务器上残留训练进程,请先在对应服务器上查找并手动结束残留的训练进程,再重新拉起分布式训练
+
+**报错处理**:训练报错时,请先查看各device对应的训练日志定位问题,确认单卡训练命令本身可以正常执行后,再排查分布式配置
+
+
+
+## 4 分布式插件参数说明
+
+| 参数名 | 默认值 | 类型 | 参数说明 |
+| --------------- | ------ | ---- | ------------------------------------------------------------ |
+| -h 或 --help | 无 | 无 | 打印帮助信息,使用 python3 distribute_npu.py --help 查看帮助信息 |
+| --env | None | 必须 | 环境信息,按照ip:device数量:device_id的格式进行输入,多机时请用','进行分隔。示例:--env 10.10.10.10:4:0123,10.10.10.11:4:1234 |
+| --np | 8 | 必须 | 总共使用的device数量,默认为8卡。示例:--np 16 |
+| --train_command | None | 必须 | 启动单卡训练的指令。示例:--train_command "bash train_1p.sh --data_path=/home/data" |
+| --aoe | False | 可选 | 是否使用AOE工具进行分布式梯度调优,默认为False。使用 --aoe=True 开启 |
+| --use_library | True | 可选 | 是否使用AOE调优生成的知识库,默认为True。当用户不需要调用知识库时,可使用 --use_library=False 禁用 |
+
+
+
-- Gitee
From 19d00546c7caedddd077ee04157cb5d2bc7bdd3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=82=85=E5=9B=BD=E9=91=AB?= <745324460@qq.com>
Date: Thu, 29 Sep 2022 08:59:01 +0000
Subject: [PATCH 08/11] update Tools/ascend_distribute/README.md.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 傅国鑫 <745324460@qq.com>
---
 Tools/ascend_distribute/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Tools/ascend_distribute/README.md b/Tools/ascend_distribute/README.md
index a508d7157..3416d1bd8 100644
--- a/Tools/ascend_distribute/README.md
+++ b/Tools/ascend_distribute/README.md
@@ -59,7 +59,7 @@ AOE工具的下载和安装请参见《AOE工具使用指南》,如需使用
 
 **环境初始化**
 
-为了实现多机可以跨服务器拉起,执行训练之前,请完成环境配置,相关文件保存在```~/ascend_tools/ascend_distribute``` 目录下,已配置无需重复配置,华为不会记录你的任何环境信息
+为了实现多机可以跨服务器拉起,执行训练之前,请完成环境配置,相关文件保存在```~/ascend_tools/ascend_distribute``` 目录下,已配置无需重复配置。
-- Gitee
From 29eb51e1d2265dd4e9d3adee42f3aefc635b3d7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=82=85=E5=9B=BD=E9=91=AB?= <745324460@qq.com>
Date: Thu, 29 Sep 2022 09:18:29 +0000
Subject: [PATCH 09/11] add train_performance_distribute.sh.
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 傅国鑫 <745324460@qq.com> --- .../test/train_performance_distribute.sh | 191 ++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/train_performance_distribute.sh diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/train_performance_distribute.sh b/TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/train_performance_distribute.sh new file mode 100644 index 000000000..a4c2d86f3 --- /dev/null +++ b/TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/train_performance_distribute.sh @@ -0,0 +1,191 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export JOB_ID=99990001 +export SLOG_PRINT_TO_STDOUT=0 +export HCCL_CONNECT_TIMEOUT=600 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#设置默认日志级别,不需要修改 +export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="SSD-Resnet34_ID0048_for_TensorFlow" +#训练epoch +train_epochs=8 +#训练batch_size +batch_size=32 +#训练step +train_steps=1000 +#学习率 +learning_rate= + +#维测参数,precision_mode需要模型审视修改 +#precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False +autotune=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_8p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + 
+#autotune时,先开启autotune执行单P训练,不需要修改 +if [[ $autotune == True ]]; then + train_full_1p.sh --autotune=$autotune --data_path=$data_path + wait + autotune=False +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" +export DEVICE_INDEX=$RANK_ID +export DEVICE_ID=$ASCEND_DEVICE_ID + +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +fi + + # # 绑核,不需要的绑核的模型删除,需要模型审视修改 + # corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` + # let a=RANK_ID*${corenum}/${RANK_SIZE} + # let b=RANK_ID+1 + # let c=b*${corenum}/${RANK_SIZE}-1 + + # #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + # #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path + # if [ "x${bind_core}" != x ];then + # bind_core="taskset -c $a-$c" + # fi + # nohup ${bind_core} python3.7 ${cur_path}/../ssd_main.py --mode=train \ +nohup python3.7 ${cur_path}/../ssd_main.py --mode=train \ + --train_batch_size=${batch_size} \ + --training_file_pattern=${data_path}/coco_official_2017/tfrecord/train2017* \ + --resnet_checkpoint=${data_path}/resnet34_pretrain/model.ckpt-28152 \ + --validation_file_pattern=${data_path}/coco_official_2017/tfrecord/val2017* \ + --val_json_file=${data_path}/coco_official_2017/annotations/instances_val2017.json \ + --eval_batch_size=${batch_size} \ + --num_epochs=${train_epochs} \ + --num_examples_per_epoch=64000 \ + --model_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/d_solution/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`cat ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep "\] FPS:" | awk -F "FPS: " '{print $2}' | awk -F "," '{print $1}' | tail -n +2 | awk '{sum+=$1} END {print sum/NR}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +#train_accuracy=`grep -A 1 top1 $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $3}'` +#打印,不需要修改 +#echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "\] FPS:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss: " '{print $2}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo 
"CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 022f04b2fed71a6d0b7042578db3a779f14a4703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=9B=BD=E9=91=AB?= <745324460@qq.com> Date: Thu, 29 Sep 2022 09:27:43 +0000 Subject: [PATCH 10/11] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20To?= =?UTF-8?q?ols/ascend=5Fdistribute/common=5Fflags.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Tools/ascend_distribute/common_flags.py | 160 ------------------------ 1 file changed, 160 deletions(-) delete mode 100644 Tools/ascend_distribute/common_flags.py diff --git a/Tools/ascend_distribute/common_flags.py b/Tools/ascend_distribute/common_flags.py deleted file mode 100644 index 1244f6578..000000000 --- a/Tools/ascend_distribute/common_flags.py +++ /dev/null @@ -1,160 +0,0 @@ -# -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# -*- coding: utf-8 -*- -import argparse - - -def define_ascend_distribute_flags(): - parser = argparse.ArgumentParser("""NPU Distribute run commond - - ################################################################################################################################################################## - - WARNING: Users only need to focus on '--np', '--env', '--train_command', '--aoe' and '--use_library' parameters, do not change other parameters! - WARNING: Before using this tool, please ensure you can perform one-device training with the Ascend NPU, and ensure using this tool with the same training command! - WARNING: Before using this tool, users need to define a config file. For more details, please see "README.md". 
- - Users can use this tool easily with the follow examples: - common command format: python3 distribute_npu.py --np (total device mun) --env (ip):(device num):(device id) --train_command "onedevice training command" - for one-worker-multi-devices training: python3 distribute_npu.py --np 4 --env 10.10.10.10:4:0123 --train_command "bash train.sh" - for multi-workers-multi-devices training: python3 distribute_npu.py --np 8 --env 10.10.10.10:4:0123,10.10.10.11:4:0123 --train_command "bash train.sh" - for using AOE tuning tool: python3 distribute_npu.py --np 4 --env 10.10.10.10:4:0123 --train_command "bash train.sh" --aoe=True - for disable the AOE tuned bank file: python3 distribute_npu.py --np 4 --env 10.10.10.10:4:0123 --train_command "bash train.sh" --use_library=False - - ATTENTION: 1. After successful one-worker-multi-devices training, users can train with multi-workers, just need to modify the '--env' parameter. - 2. When setting the '--env', please using ',' to separate different workers, and do not forget to modify the config file which includes env info. - 3. After successful one-worker-multi-devices training, users can tune with the AOE tool, just need to add '--aoe=True' after the previous command. - 4. After AOE, if a 'xx_gradient_fusion.json' file generated in '/root/ascend_tools/ascend_distribute/custom_tune_bank/' directory, AOE is successful. - 5. Using AOE tuned file is default, users can set '--use_library=False' to disable using AOE tuned file. - - ################################################################################################################################################################## - """) - parser.add_argument( - "--config", - default=None, - help="Enter containing server ip:username:password.", - ) - - parser.add_argument( - "--np", - default=8, - type=int, - help="Necessary, the total number of devices used for training.", - ) - - parser.add_argument( - "--env", - default=None, - help="Necessary, environment information, please input with '--env {ip}:{device num}:{device ip}' format, when training with MultiWorker, please use ',' to separate different workers", - ) - - parser.add_argument( - "--train_command", - default=None, - type=str, - help="Necessary, training command, input like --train_command 'bash train_1p.sh' or --train_command 'python3 train.py'", - ) - - parser.add_argument( - "--aoe", - default=False, - type=bool, - help="Optional, if or not use AOE, default is False, use --aoe=True to enable", - ) - - parser.add_argument( - "--use_library", - default=False, - type=bool, - help="Optional, if or not training with custom tune bank file witch generated by AOE, default is False, use --use_library=True to enable", - ) - - parser.add_argument( - "--config_file", - default=None, - ) - - parser.add_argument( - "--train_log_dir", - default="", - type=str, - ) - - parser.add_argument( - "--device_id", - default=0, - type=int, - ) - - parser.add_argument( - "--rank_nums", - default=2, - type=int, - ) - - parser.add_argument( - "--start_rank_id", - default=0, - type=int, - ) - - parser.add_argument( - "--multi_worker", - action="store_true", - ) - - parser.add_argument( - "--rank_size", - default=8, - type=int, - ) - - parser.add_argument( - "--worker_num", - default=0, - type=int, - ) - - parser.add_argument( - "--use_config", - action="store_true", - ) - - parser.add_argument( - "--command_list", - default=None, - type=str, - ) - - parser.add_argument( - "--server_list", - type=None, - ) - - return parser \ No newline at end of file -- Gitee 
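Editorial note between patches 10 and 11: the patch above removes common_flags.py and the next patch restores it with re-wrapped help strings; the flag semantics are unchanged. For readers unfamiliar with the `--env {ip}:{device count}:{device ids}` convention that these flags and the README describe, the following is a minimal, hypothetical parser sketch. It is not code from ascend_distribute (whose actual parsing lives in its config/multi_utils modules); the function name `parse_env` and its return shape are invented here purely to illustrate the documented format.

```
# Hypothetical illustration only; not part of the ascend_distribute tool.
from typing import Dict, List


def parse_env(env: str) -> Dict[str, List[int]]:
    """Parse an --env string such as '10.10.10.10:4:0123,10.10.10.11:8'.

    Each comma-separated item is {ip}:{device count}[:{device ids}].
    When the device-id field is omitted (the whole-server 8-device case
    in the README), devices 0..count-1 are assumed.
    """
    servers: Dict[str, List[int]] = {}
    for item in env.split(","):
        parts = item.split(":")
        ip, count = parts[0], int(parts[1])
        if len(parts) > 2:
            device_ids = [int(ch) for ch in parts[2]]
        else:
            device_ids = list(range(count))
        if len(device_ids) != count:
            raise ValueError(f"device count {count} does not match ids {device_ids} for {ip}")
        servers[ip] = device_ids
    return servers


if __name__ == "__main__":
    print(parse_env("10.10.10.10:4:0123,10.10.10.11:8"))
    # {'10.10.10.10': [0, 1, 2, 3], '10.10.10.11': [0, 1, 2, 3, 4, 5, 6, 7]}
```

Summing the per-server device counts gives the value expected in `--np`, which is why the two-server example in the README uses `--np 16` for `10.10.10.10:8,10.10.10.11:8`.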
From a9ec51e254a04cf2b06723829b3eee2d21f4ff1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=9B=BD=E9=91=AB?= <745324460@qq.com> Date: Thu, 29 Sep 2022 09:28:10 +0000 Subject: [PATCH 11/11] update ascend_distribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 傅国鑫 <745324460@qq.com> --- Tools/ascend_distribute/common_flags.py | 163 ++++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 Tools/ascend_distribute/common_flags.py diff --git a/Tools/ascend_distribute/common_flags.py b/Tools/ascend_distribute/common_flags.py new file mode 100644 index 000000000..50d0b5655 --- /dev/null +++ b/Tools/ascend_distribute/common_flags.py @@ -0,0 +1,163 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# -*- coding: utf-8 -*- +import argparse + + +def define_ascend_distribute_flags(): + parser = argparse.ArgumentParser("""NPU Distribute run commond + + ################################################################################################################################################################## + + WARNING: Users only need to focus on '--np', '--env', '--train_command', '--aoe' and '--use_library' parameters, do not change other parameters! + WARNING: Before using this tool, please ensure you can perform one-device training with the Ascend NPU, and ensure using this tool with the same training command! + WARNING: Before using this tool, users need to define a config file. For more details, please see "README.md". 
+ + Users can use this tool easily with the follow examples: + common command format: python3 distribute_npu.py --np (total device mun) --env (ip):(device num):(device id) --train_command "onedevice training command" + for one-worker-multi-devices training: python3 distribute_npu.py --np 4 --env 10.10.10.10:4:0123 --train_command "bash train.sh" + for multi-workers-multi-devices training: python3 distribute_npu.py --np 8 --env 10.10.10.10:4:0123,10.10.10.11:4:0123 --train_command "bash train.sh" + for using AOE tuning tool: python3 distribute_npu.py --np 4 --env 10.10.10.10:4:0123 --train_command "bash train.sh" --aoe=True + for disable the AOE tuned bank file: python3 distribute_npu.py --np 4 --env 10.10.10.10:4:0123 --train_command "bash train.sh" --use_library=False + + ATTENTION: 1. After successful one-worker-multi-devices training, users can train with multi-workers, just need to modify the '--env' parameter. + 2. When setting the '--env', please using ',' to separate different workers, and do not forget to modify the config file which includes env info. + 3. After successful one-worker-multi-devices training, users can tune with the AOE tool, just need to add '--aoe=True' after the previous command. + 4. After AOE, if a 'xx_gradient_fusion.json' file generated in '/root/ascend_tools/ascend_distribute/custom_tune_bank/' directory, AOE is successful. + 5. Using AOE tuned file is default, users can set '--use_library=False' to disable using AOE tuned file. + + ################################################################################################################################################################## + """) + parser.add_argument( + "--config", + default=None, + help="Enter containing server ip:username:password.", + ) + + parser.add_argument( + "--np", + default=8, + type=int, + help="Necessary, the total number of devices used for training.", + ) + + parser.add_argument( + "--env", + default=None, + help="Necessary, environment information, please input with '--env {ip}:{device num}:{device ip}' format, " + "when training with MultiWorker, please use ',' to separate different workers", + ) + + parser.add_argument( + "--train_command", + default=None, + type=str, + help="Necessary, training command, input like --train_command 'bash train_1p.sh' or " + "--train_command 'python3 train.py'", + ) + + parser.add_argument( + "--aoe", + default=False, + type=bool, + help="Optional, if or not use AOE, default is False, use --aoe=True to enable", + ) + + parser.add_argument( + "--use_library", + default=False, + type=bool, + help="Optional, if or not training with custom tune bank file witch generated by AOE, default is False, " + "use --use_library=True to enable", + ) + + parser.add_argument( + "--config_file", + default=None, + ) + + parser.add_argument( + "--train_log_dir", + default="", + type=str, + ) + + parser.add_argument( + "--device_id", + default=0, + type=int, + ) + + parser.add_argument( + "--rank_nums", + default=2, + type=int, + ) + + parser.add_argument( + "--start_rank_id", + default=0, + type=int, + ) + + parser.add_argument( + "--multi_worker", + action="store_true", + ) + + parser.add_argument( + "--rank_size", + default=8, + type=int, + ) + + parser.add_argument( + "--worker_num", + default=0, + type=int, + ) + + parser.add_argument( + "--use_config", + action="store_true", + ) + + parser.add_argument( + "--command_list", + default=None, + type=str, + ) + + parser.add_argument( + "--server_list", + type=None, + ) + + return parser \ No newline at end of 
file -- Gitee
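A note on the boolean flags (`--aoe`, `--use_library`) defined in common_flags.py above: argparse's `type=bool` applies Python's built-in `bool()` to the raw string, so any non-empty value, including the literal string "False", is parsed as True. If the True/False switch behaviour described in the help text and the README examples is intended, a small string-to-bool converter is the usual approach. The sketch below is a suggestion under that assumption, not part of the patch series.

```
import argparse


def str2bool(value: str) -> bool:
    """Convert 'True'/'False'-style command line strings to real booleans."""
    if isinstance(value, bool):
        return value
    if value.lower() in ("true", "1", "yes"):
        return True
    if value.lower() in ("false", "0", "no"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# Hypothetical usage: how --aoe could be declared with the converter.
parser = argparse.ArgumentParser()
parser.add_argument("--aoe", default=False, type=str2bool,
                    help="Optional, whether to run AOE tuning; use --aoe=True to enable")
print(parser.parse_args(["--aoe=False"]).aoe)  # prints False, as intended
```

With such a converter, `--aoe=False` and `--use_library=False` behave exactly as the README's examples imply.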