From 6cd5ec70ae9539d736bf868ce05bc8ad599c41f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:31:22 +0000 Subject: [PATCH 01/20] update train_full_1p.sh. --- .../BertBase_ID0490_for_PyTorch/test/train_full_1p.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_1p.sh b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_1p.sh index abe4444d24..fcbca90608 100644 --- a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_1p.sh +++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_1p.sh @@ -64,6 +64,7 @@ if [[ $data_path == "" ]];then exit 1 fi +export RANK=0 #训练开始时间,不需要修改 start_time=$(date +%s) -- Gitee From bcfaa8656891e32be3175bbf98afc64b8eabb8be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:31:56 +0000 Subject: [PATCH 02/20] update train_performance_1p.sh. --- .../BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh index 884fd0cc89..f09723a2fa 100644 --- a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh @@ -64,6 +64,7 @@ if [[ $data_path == "" ]];then exit 1 fi +export RANK=0 #训练开始时间,不需要修改 start_time=$(date +%s) -- Gitee From 48cdacaa117eb42aadcce30c2193dd228e4b1376 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:34:43 +0000 Subject: [PATCH 03/20] update main_apex_d76_npu.py. --- .../DistributedResnet50/main_apex_d76_npu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py index 26edd676ce..34329f23a4 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py @@ -416,8 +416,10 @@ def main_worker(gpu, ngpus_per_node, args): args.rank = args.rank * ngpus_per_node + gpu if args.device == 'npu': + RANK = int(os.environ["rank_server"]) + print("args.rank+RANK :", args.rank+RANK) dist.init_process_group(backend=args.dist_backend, - world_size=args.world_size, rank=args.rank) + world_size=args.world_size, rank=args.rank+RANK) else: dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) -- Gitee From 8869c20d2be6fbadf14bb8e0cb108ca0bf1ba17e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:36:17 +0000 Subject: [PATCH 04/20] add train_performance_16p.sh. --- .../test/train_performance_16p.sh | 156 ++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh new file mode 100644 index 0000000000..d602403ad9 --- /dev/null +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh @@ -0,0 +1,156 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="ResNet50_ID0095_for_PyTorch" +# 训练batch_size +batch_size=4096 +# 训练使用的npu卡数 +export RANK_SIZE=16 +# 数据集路径,保持为空,不需要修改 +data_path="" +conf_path="" +server_index="" +fix_node_ip="" + +# 训练epoch 90 +train_epochs=3 +# 加载数据进程数 +workers=128 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --conf_path* ]];then + conf_path=`echo ${para#*=}` + elif [[ $para == --server_index* ]];then + server_index=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +linux_num=`find $conf_path -name "server_*.info" |wc -l` + +export HCCL_IF_IP=$fix_node_ip +export MASTER_ADDR=$one_node_ip + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +# 训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +export rank_server=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'` +export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'` + +python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ + --data ${data_path} \ + --addr=$one_node_ip \ + --seed=49 \ + --workers=${workers} \ + --learning-rate=1.6 \ + --warmup=8 \ + --label-smoothing=0.1 \ + --mom=0.9 \ + --weight-decay=1.0e-04 \ + --static-loss-scale=128 \ + --print-freq=1 \ + --dist-url='tcp://127.0.0.1:50000' \ + --dist-backend='hccl' \ + --multiprocessing-distributed \ + --world-size=2 \ + --rank=0 \ + --benchmark=0 \ + --device='npu' \ + --epochs=${train_epochs} \ + --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +# 训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +# 训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +# 结果打印,不需要修改 +echo "------------------ Final result ------------------" +# 输出性能FPS,需要模型审视修改 +grep "FPS@all" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk '{print $11}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_fps.log +FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_fps.log | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` +# 打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +# 输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +# 打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +# 性能看护结果汇总 +# 获取性能数据,不需要修改 +# 吞吐量 +ActualFPS=${FPS} +# 单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +# 从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +# 最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +# 关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 0ca299265f51daa4eb1ed8ddc09ad3e65b60c4e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:37:01 +0000 Subject: [PATCH 05/20] update train_performance_8p.sh. --- .../ResNet50_for_PyTorch/test/train_performance_8p.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh index 850d3dda9e..b5353c8fab 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh @@ -63,6 +63,8 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi +export rank_server=0 + python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ --addr=$(hostname -I |awk '{print $1}') \ -- Gitee From 5795272857282272759ea0a43ac26358ca757fa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:38:25 +0000 Subject: [PATCH 06/20] update train_full_8p.sh. --- .../ResNet50_for_PyTorch/test/train_full_8p.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh index 56b6147f44..50e8908c31 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh @@ -51,9 +51,9 @@ do #设置环境变量,不需要修改 export ASCEND_DEVICE_ID=$RANK_ID echo "Device ID: $ASCEND_DEVICE_ID" - - - + + + #创建DeviceID输出目录,不需要修改 if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} @@ -66,7 +66,7 @@ do #SOLVER.MAX_ITER 82000 \ #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -done +done #################启动训练脚本################# # 训练开始时间,不需要修改 -- Gitee From df7c5326962221d0a7dc78c07d22b12a2f9088c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:40:13 +0000 Subject: [PATCH 07/20] update train_full_8p.sh. --- .../ResNet50_for_PyTorch/test/train_full_8p.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh index 50e8908c31..56b6147f44 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh @@ -51,9 +51,9 @@ do #设置环境变量,不需要修改 export ASCEND_DEVICE_ID=$RANK_ID echo "Device ID: $ASCEND_DEVICE_ID" - - - + + + #创建DeviceID输出目录,不需要修改 if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} @@ -66,7 +66,7 @@ do #SOLVER.MAX_ITER 82000 \ #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -done +done #################启动训练脚本################# # 训练开始时间,不需要修改 -- Gitee From 4e7f23c24b790f87f3194ad8489ff710f0de4836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:41:21 +0000 Subject: [PATCH 08/20] update train_full_8p.sh. --- .../cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh index 56b6147f44..b609a83864 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh @@ -68,6 +68,7 @@ do #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 done +export rank_server=0 #################启动训练脚本################# # 训练开始时间,不需要修改 start_time=$(date +%s) -- Gitee From 3c660f8d6bb6364cd58df3a8b54a3c086ef83f4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:49:22 +0000 Subject: [PATCH 09/20] update train_performance_16p.sh. --- .../ResNet50_for_PyTorch/test/train_performance_16p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh index d602403ad9..33678576e3 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh @@ -120,7 +120,7 @@ CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' echo "------------------ Final result ------------------" # 输出性能FPS,需要模型审视修改 grep "FPS@all" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk '{print $11}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_fps.log -FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_fps.log | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` +FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_fps.log | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a*2/NR)}'` # 打印,不需要修改 echo "Final Performance images/sec : $FPS" -- Gitee From 1ad6284ec5c5f4f8bedd58ece1a9dd1d21de727d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 06:40:45 +0000 Subject: [PATCH 10/20] update train_performance_16p.sh. --- .../ResNet50_for_PyTorch/test/train_performance_16p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh index 33678576e3..2258c682a9 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh @@ -78,7 +78,7 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -export rank_server=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'` +export RANK=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'` export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'` python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ -- Gitee From 3f3ffd81824553efab3580aaf4e0e36309cfb4ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 06:49:00 +0000 Subject: [PATCH 11/20] update train_performance_16p.sh. --- .../ResNet50_for_PyTorch/test/train_performance_16p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh index 2258c682a9..800842db03 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh @@ -78,7 +78,7 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -export RANK=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'` +export NUM_NODES=${server_index} export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'` python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ -- Gitee From 37053387dfd6a06d1684ab9be5967f0b5c71b83a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 06:52:56 +0000 Subject: [PATCH 12/20] update main_apex_d76_npu.py. --- .../DistributedResnet50/main_apex_d76_npu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py index 34329f23a4..e17a439586 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py @@ -416,10 +416,10 @@ def main_worker(gpu, ngpus_per_node, args): args.rank = args.rank * ngpus_per_node + gpu if args.device == 'npu': - RANK = int(os.environ["rank_server"]) - print("args.rank+RANK :", args.rank+RANK) + RANK = int(os.environ["NUM_NODES"]) * 8 + args.rank + print("the global_rank is :", RANK) dist.init_process_group(backend=args.dist_backend, - world_size=args.world_size, rank=args.rank+RANK) + world_size=args.world_size, rank=RANK) else: dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) -- Gitee From 61c4fbe6ec441dc3268974b3433d873173e54c5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 06:54:28 +0000 Subject: [PATCH 13/20] update train_full_8p.sh. --- .../classification/ResNet50_for_PyTorch/test/train_full_8p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh index b609a83864..81bef43e2d 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh @@ -68,7 +68,7 @@ do #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 done -export rank_server=0 +export NUM_NODES=0 #################启动训练脚本################# # 训练开始时间,不需要修改 start_time=$(date +%s) -- Gitee From 5dc2195e99db17d96a11c4c53047437ae4449047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 06:56:15 +0000 Subject: [PATCH 14/20] update train_performance_8p.sh. --- .../ResNet50_for_PyTorch/test/train_performance_8p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh index b5353c8fab..b566bf2ada 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh @@ -63,7 +63,7 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -export rank_server=0 +export NUM_NODES=0 python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ -- Gitee From 2ae975249f10509514c14746cc1ffbcb83511b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 07:44:53 +0000 Subject: [PATCH 15/20] update train_performance_16p.sh. --- .../ResNet50_for_PyTorch/test/train_performance_16p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh index 800842db03..cea3033577 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh @@ -78,7 +78,7 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -export NUM_NODES=${server_index} +export NODE_RANK=${server_index} export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'` python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ -- Gitee From f93a973f69ffbace77ef6b52bdafcce9b4e08a92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 07:45:54 +0000 Subject: [PATCH 16/20] update main_apex_d76_npu.py. --- .../DistributedResnet50/main_apex_d76_npu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py index e17a439586..2bd40c9fe5 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py @@ -416,7 +416,7 @@ def main_worker(gpu, ngpus_per_node, args): args.rank = args.rank * ngpus_per_node + gpu if args.device == 'npu': - RANK = int(os.environ["NUM_NODES"]) * 8 + args.rank + RANK = int(os.environ["NODE_RANK"]) * 8 + args.rank print("the global_rank is :", RANK) dist.init_process_group(backend=args.dist_backend, world_size=args.world_size, rank=RANK) -- Gitee From f33ce1221d29ca534ee0fd8bc6c742e39cd9f64e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 07:47:10 +0000 Subject: [PATCH 17/20] update train_full_8p.sh. --- .../classification/ResNet50_for_PyTorch/test/train_full_8p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh index 81bef43e2d..1cc50890bd 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh @@ -68,7 +68,7 @@ do #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 done -export NUM_NODES=0 +export NODE_RANK=0 #################启动训练脚本################# # 训练开始时间,不需要修改 start_time=$(date +%s) -- Gitee From e8a00387908c7216f26058e7b714b9e11ede6a92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 07:48:04 +0000 Subject: [PATCH 18/20] update train_performance_8p.sh. --- .../ResNet50_for_PyTorch/test/train_performance_8p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh index b566bf2ada..84c999e576 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh @@ -63,7 +63,7 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -export NUM_NODES=0 +export NODE_RANK=0 python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ -- Gitee From 701cb81fb6ae01841eb47dec608f3591dea90779 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 07:51:45 +0000 Subject: [PATCH 19/20] update main_apex_d76_npu.py. --- .../DistributedResnet50/main_apex_d76_npu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py index 2bd40c9fe5..bfbffbb96f 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py @@ -416,10 +416,10 @@ def main_worker(gpu, ngpus_per_node, args): args.rank = args.rank * ngpus_per_node + gpu if args.device == 'npu': - RANK = int(os.environ["NODE_RANK"]) * 8 + args.rank - print("the global_rank is :", RANK) + rank = int(os.environ["NODE_RANK"]) * 8 + args.rank + print("the global_rank is :", rank) dist.init_process_group(backend=args.dist_backend, - world_size=args.world_size, rank=RANK) + world_size=args.world_size, rank=rank) else: dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) -- Gitee From 0a4442c6503cf69ca15b5a3e34b3ee2c302994de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 07:56:08 +0000 Subject: [PATCH 20/20] update main_apex_d76_npu.py. --- .../DistributedResnet50/main_apex_d76_npu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py index bfbffbb96f..3176314e25 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py @@ -416,10 +416,10 @@ def main_worker(gpu, ngpus_per_node, args): args.rank = args.rank * ngpus_per_node + gpu if args.device == 'npu': - rank = int(os.environ["NODE_RANK"]) * 8 + args.rank - print("the global_rank is :", rank) + args.rank = int(os.environ["NODE_RANK"]) * 8 + args.rank + print("the global_rank is :", args.rank) dist.init_process_group(backend=args.dist_backend, - world_size=args.world_size, rank=rank) + world_size=args.world_size, rank=args.rank) else: dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) -- Gitee