diff --git a/tutorials/source_en/advanced_use/distributed_training.md b/tutorials/source_en/advanced_use/distributed_training.md index 20cd61d6253bb7392aa9e8c422aeedadfb102ee4..8b04d272e94fdfa5175d1725d687b61d0a7a37d9 100644 --- a/tutorials/source_en/advanced_use/distributed_training.md +++ b/tutorials/source_en/advanced_use/distributed_training.md @@ -54,45 +54,33 @@ The following uses the Ascend 910 AI processor as an example. The JSON configura ```json { - "board_id": "0x0000", - "chip_info": "910", - "deploy_mode": "lab", - "group_count": "1", - "group_list": [ + "version": "1.0", + "server_count": "1", + "server_list": [ { - "device_num": "8", - "server_num": "1", - "group_name": "", - "instance_count": "8", - "instance_list": [ - {"devices": [{"device_id": "0","device_ip": "192.1.27.6"}],"rank_id": "0","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "1","device_ip": "192.2.27.6"}],"rank_id": "1","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "2","device_ip": "192.3.27.6"}],"rank_id": "2","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "3","device_ip": "192.4.27.6"}],"rank_id": "3","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "4","device_ip": "192.1.27.7"}],"rank_id": "4","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "5","device_ip": "192.2.27.7"}],"rank_id": "5","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "6","device_ip": "192.3.27.7"}],"rank_id": "6","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "7","device_ip": "192.4.27.7"}],"rank_id": "7","server_id": "10.155.111.140"} - ] + "server_id": "10.155.111.140", + "device": [ + {"device_id": "0","device_ip": "192.1.27.6","rank_id": "0"}, + {"device_id": "1","device_ip": "192.2.27.6","rank_id": "1"}, + {"device_id": "2","device_ip": "192.3.27.6","rank_id": "2"}, + {"device_id": "3","device_ip": "192.4.27.6","rank_id": "3"}, + {"device_id": "4","device_ip": "192.1.27.7","rank_id": "4"}, + {"device_id": 
"5","device_ip": "192.2.27.7","rank_id": "5"}, + {"device_id": "6","device_ip": "192.3.27.7","rank_id": "6"}, + {"device_id": "7","device_ip": "192.4.27.7","rank_id": "7"}], + "host_nic_ip": "reserve" } ], - "para_plane_nic_location": "device", - "para_plane_nic_name": ["eth0","eth1","eth2","eth3","eth4","eth5","eth6","eth7"], - "para_plane_nic_num": "8", "status": "completed" } - ``` The following parameters need to be modified based on the actual training environment: -- `board_id`: current running environment. Set this parameter to `0x0000` for x86, and to `0x0020` for ARM. -- `server_num`: number of hosts. +- `server_count`: number of hosts. - `server_id`: IP address of the local host. -- `device_num`, `para_plane_nic_num`, and `instance_count`: number of devices. -- `rank_id`: logical sequence number of a device, which starts from 0. - `device_id`: physical sequence number of a device, that is, the actual sequence number of the device on the corresponding host. - `device_ip`: IP address of the integrated NIC. You can run the `cat /etc/hccn.conf` command on the current host. The key value of `address_x` is the IP address of the NIC. -- `para_plane_nic_name`: name of the corresponding NIC. +- `rank_id`: logical sequence number of a device, which starts from 0. 
### Calling the Collective Communication Library diff --git a/tutorials/source_zh_cn/advanced_use/distributed_training.md b/tutorials/source_zh_cn/advanced_use/distributed_training.md index 52627db1c5abfa1614582dcab03a7571d2cbff70..fd5d682a8cc02e1815e7b429efd15c1f7e646d9c 100644 --- a/tutorials/source_zh_cn/advanced_use/distributed_training.md +++ b/tutorials/source_zh_cn/advanced_use/distributed_training.md @@ -54,43 +54,34 @@ ```json { - "board_id": "0x0000", - "chip_info": "910", - "deploy_mode": "lab", - "group_count": "1", - "group_list": [ + "version": "1.0", + "server_count": "1", + "server_list": [ { - "device_num": "8", - "server_num": "1", - "group_name": "", - "instance_count": "8", - "instance_list": [ - {"devices": [{"device_id": "0","device_ip": "192.1.27.6"}],"rank_id": "0","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "1","device_ip": "192.2.27.6"}],"rank_id": "1","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "2","device_ip": "192.3.27.6"}],"rank_id": "2","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "3","device_ip": "192.4.27.6"}],"rank_id": "3","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "4","device_ip": "192.1.27.7"}],"rank_id": "4","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "5","device_ip": "192.2.27.7"}],"rank_id": "5","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "6","device_ip": "192.3.27.7"}],"rank_id": "6","server_id": "10.155.111.140"}, - {"devices": [{"device_id": "7","device_ip": "192.4.27.7"}],"rank_id": "7","server_id": "10.155.111.140"} - ] + "server_id": "10.155.111.140", + "device": [ + {"device_id": "0","device_ip": "192.1.27.6","rank_id": "0"}, + {"device_id": "1","device_ip": "192.2.27.6","rank_id": "1"}, + {"device_id": "2","device_ip": "192.3.27.6","rank_id": "2"}, + {"device_id": "3","device_ip": "192.4.27.6","rank_id": "3"}, + {"device_id": "4","device_ip": "192.1.27.7","rank_id": "4"}, + {"device_id": "5","device_ip": 
"192.2.27.7","rank_id": "5"}, + {"device_id": "6","device_ip": "192.3.27.7","rank_id": "6"}, + {"device_id": "7","device_ip": "192.4.27.7","rank_id": "7"}], + "host_nic_ip": "reserve" } ], - "para_plane_nic_location": "device", - "para_plane_nic_name": ["eth0","eth1","eth2","eth3","eth4","eth5","eth6","eth7"], - "para_plane_nic_num": "8", "status": "completed" } - ``` + 其中需要根据实际训练环境修改的参数项有: -- `board_id`表示当前运行的环境,x86设为`0x0000`,arm设为`0x0020`。 -- `server_num`表示机器数量, `server_id`表示本机IP地址。 -- `device_num`、`para_plane_nic_num`及`instance_count`表示卡的数量。 -- `rank_id`表示卡逻辑序号,固定从0开始编号,`device_id`表示卡物理序号,即卡所在机器中的实际序号。 +- `server_count`表示参与训练的机器数量。 +- `server_id`表示当前机器的IP地址。 +- `device_id`表示卡物理序号,即卡所在机器中的实际序号。 - `device_ip`表示集成网卡的IP地址,可以在当前机器执行指令`cat /etc/hccn.conf`,`address_x`的键值就是网卡IP地址。 -- `para_plane_nic_name`对应网卡名称。 +- `rank_id`表示卡逻辑序号,固定从0开始编号。 ### 调用集合通信库 diff --git a/tutorials/tutorial_code/distributed_training/rank_table_2pcs.json b/tutorials/tutorial_code/distributed_training/rank_table_2pcs.json index 5f76d8339be0987e91985b5fed6a82bb7ed5004e..2d053b4a5862e1450158617487d084fe0df90a73 100644 --- a/tutorials/tutorial_code/distributed_training/rank_table_2pcs.json +++ b/tutorials/tutorial_code/distributed_training/rank_table_2pcs.json @@ -1,44 +1,14 @@ { - "board_id": "0x0000", - "chip_info": "910", - "deploy_mode": "lab", - "group_count": "1", - "group_list": [ + "version": "1.0", + "server_count": "1", + "server_list": [ { - "device_num": "2", - "server_num": "1", - "group_name": "", - "instance_count": "2", - "instance_list": [ - { - "devices": [ - { - "device_id": "0", - "device_ip": "192.1.27.6" - } - ], - "rank_id": "0", - "server_id": "10.155.111.140" - }, - { - "devices": [ - { - "device_id": "1", - "device_ip": "192.2.27.6" - } - ], - "rank_id": "1", - "server_id": "10.155.111.140" - } - ] + "server_id": "10.155.111.140", + "device": [ + {"device_id": "0","device_ip": "192.1.27.6","rank_id": "0"}, + {"device_id": "1","device_ip": "192.2.27.6","rank_id": "1"}], + 
"host_nic_ip": "reserve" } ], - "para_plane_nic_location": "device", - "para_plane_nic_name": [ - "eth0", - "eth1" - ], - "para_plane_nic_num": "2", "status": "completed" } - diff --git a/tutorials/tutorial_code/distributed_training/rank_table_8pcs.json b/tutorials/tutorial_code/distributed_training/rank_table_8pcs.json index bcc33e6bf7efdadf1952fb651dfef278935ba009..c5967d83cc850a675e0facc8661391239dace398 100644 --- a/tutorials/tutorial_code/distributed_training/rank_table_8pcs.json +++ b/tutorials/tutorial_code/distributed_training/rank_table_8pcs.json @@ -1,110 +1,20 @@ { - "board_id": "0x0000", - "chip_info": "910", - "deploy_mode": "lab", - "group_count": "1", - "group_list": [ + "version": "1.0", + "server_count": "1", + "server_list": [ { - "device_num": "8", - "server_num": "1", - "group_name": "", - "instance_count": "8", - "instance_list": [ - { - "devices": [ - { - "device_id": "0", - "device_ip": "192.1.27.6" - } - ], - "rank_id": "0", - "server_id": "10.155.111.140" - }, - { - "devices": [ - { - "device_id": "1", - "device_ip": "192.2.27.6" - } - ], - "rank_id": "1", - "server_id": "10.155.111.140" - }, - { - "devices": [ - { - "device_id": "2", - "device_ip": "192.3.27.6" - } - ], - "rank_id": "2", - "server_id": "10.155.111.140" - }, - { - "devices": [ - { - "device_id": "3", - "device_ip": "192.4.27.6" - } - ], - "rank_id": "3", - "server_id": "10.155.111.140" - }, - { - "devices": [ - { - "device_id": "4", - "device_ip": "192.1.27.7" - } - ], - "rank_id": "4", - "server_id": "10.155.111.140" - }, - { - "devices": [ - { - "device_id": "5", - "device_ip": "192.2.27.7" - } - ], - "rank_id": "5", - "server_id": "10.155.111.140" - }, - { - "devices": [ - { - "device_id": "6", - "device_ip": "192.3.27.7" - } - ], - "rank_id": "6", - "server_id": "10.155.111.140" - }, - { - "devices": [ - { - "device_id": "7", - "device_ip": "192.4.27.7" - } - ], - "rank_id": "7", - "server_id": "10.155.111.140" - } - ] + "server_id": "10.155.111.140", + "device": [ + 
{"device_id": "0","device_ip": "192.1.27.6","rank_id": "0"}, + {"device_id": "1","device_ip": "192.2.27.6","rank_id": "1"}, + {"device_id": "2","device_ip": "192.3.27.6","rank_id": "2"}, + {"device_id": "3","device_ip": "192.4.27.6","rank_id": "3"}, + {"device_id": "4","device_ip": "192.1.27.7","rank_id": "4"}, + {"device_id": "5","device_ip": "192.2.27.7","rank_id": "5"}, + {"device_id": "6","device_ip": "192.3.27.7","rank_id": "6"}, + {"device_id": "7","device_ip": "192.4.27.7","rank_id": "7"}], + "host_nic_ip": "reserve" } ], - "para_plane_nic_location": "device", - "para_plane_nic_name": [ - "eth0", - "eth1", - "eth2", - "eth3", - "eth4", - "eth5", - "eth6", - "eth7" - ], - "para_plane_nic_num": "8", "status": "completed" } -