From 6cd5ec70ae9539d736bf868ce05bc8ad599c41f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:31:22 +0000 Subject: [PATCH 1/8] update train_full_1p.sh. --- .../BertBase_ID0490_for_PyTorch/test/train_full_1p.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_1p.sh b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_1p.sh index abe4444d24..fcbca90608 100644 --- a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_1p.sh +++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_1p.sh @@ -64,6 +64,7 @@ if [[ $data_path == "" ]];then exit 1 fi +export RANK=0 #训练开始时间,不需要修改 start_time=$(date +%s) -- Gitee From bcfaa8656891e32be3175bbf98afc64b8eabb8be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:31:56 +0000 Subject: [PATCH 2/8] update train_performance_1p.sh. --- .../BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh index 884fd0cc89..f09723a2fa 100644 --- a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_1p.sh @@ -64,6 +64,7 @@ if [[ $data_path == "" ]];then exit 1 fi +export RANK=0 #训练开始时间,不需要修改 start_time=$(date +%s) -- Gitee From 48cdacaa117eb42aadcce30c2193dd228e4b1376 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:34:43 +0000 Subject: [PATCH 3/8] update main_apex_d76_npu.py. --- .../DistributedResnet50/main_apex_d76_npu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py index 26edd676ce..34329f23a4 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py @@ -416,8 +416,10 @@ def main_worker(gpu, ngpus_per_node, args): args.rank = args.rank * ngpus_per_node + gpu if args.device == 'npu': + RANK = int(os.environ["rank_server"]) + print("args.rank+RANK :", args.rank+RANK) dist.init_process_group(backend=args.dist_backend, - world_size=args.world_size, rank=args.rank) + world_size=args.world_size, rank=args.rank+RANK) else: dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) -- Gitee From 8869c20d2be6fbadf14bb8e0cb108ca0bf1ba17e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:36:17 +0000 Subject: [PATCH 4/8] add train_performance_16p.sh. --- .../test/train_performance_16p.sh | 156 ++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh new file mode 100644 index 0000000000..d602403ad9 --- /dev/null +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_16p.sh @@ -0,0 +1,156 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="ResNet50_ID0095_for_PyTorch" +# 训练batch_size +batch_size=4096 +# 训练使用的npu卡数 +export RANK_SIZE=16 +# 数据集路径,保持为空,不需要修改 +data_path="" +conf_path="" +server_index="" +fix_node_ip="" + +# 训练epoch 90 +train_epochs=3 +# 加载数据进程数 +workers=128 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --conf_path* ]];then + conf_path=`echo ${para#*=}` + elif [[ $para == --server_index* ]];then + server_index=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +linux_num=`find $conf_path -name "server_*.info" |wc -l` + +export HCCL_IF_IP=$fix_node_ip +export MASTER_ADDR=$one_node_ip + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +# 训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +export rank_server=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'` +export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'` + +python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ + --data ${data_path} \ + --addr=$one_node_ip \ + --seed=49 \ + --workers=${workers} \ + --learning-rate=1.6 \ + --warmup=8 \ + --label-smoothing=0.1 \ + --mom=0.9 \ + --weight-decay=1.0e-04 \ + --static-loss-scale=128 \ + --print-freq=1 \ + --dist-url='tcp://127.0.0.1:50000' \ + --dist-backend='hccl' \ + --multiprocessing-distributed \ + --world-size=2 \ + --rank=0 \ + --benchmark=0 \ + --device='npu' \ + --epochs=${train_epochs} \ + --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +# 训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +# 训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +# 结果打印,不需要修改 +echo "------------------ Final result ------------------" +# 输出性能FPS,需要模型审视修改 +grep "FPS@all" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk '{print $11}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_fps.log +FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_fps.log | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` +# 打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +# 输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +# 打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +# 性能看护结果汇总 +# 获取性能数据,不需要修改 +# 吞吐量 +ActualFPS=${FPS} +# 单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +# 从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +# 最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +# 关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 0ca299265f51daa4eb1ed8ddc09ad3e65b60c4e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:37:01 +0000 Subject: [PATCH 5/8] update train_performance_8p.sh. --- .../ResNet50_for_PyTorch/test/train_performance_8p.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh index 850d3dda9e..b5353c8fab 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh @@ -63,6 +63,8 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi +export rank_server=0 + python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ --addr=$(hostname -I |awk '{print $1}') \ -- Gitee From 5795272857282272759ea0a43ac26358ca757fa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:38:25 +0000 Subject: [PATCH 6/8] update train_full_8p.sh. --- .../ResNet50_for_PyTorch/test/train_full_8p.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh index 56b6147f44..50e8908c31 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh @@ -51,9 +51,9 @@ do #设置环境变量,不需要修改 export ASCEND_DEVICE_ID=$RANK_ID echo "Device ID: $ASCEND_DEVICE_ID" - - - + + + #创建DeviceID输出目录,不需要修改 if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} @@ -66,7 +66,7 @@ do #SOLVER.MAX_ITER 82000 \ #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -done +done #################启动训练脚本################# # 训练开始时间,不需要修改 -- Gitee From df7c5326962221d0a7dc78c07d22b12a2f9088c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:40:13 +0000 Subject: [PATCH 7/8] update train_full_8p.sh. --- .../ResNet50_for_PyTorch/test/train_full_8p.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh index 50e8908c31..56b6147f44 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh @@ -51,9 +51,9 @@ do #设置环境变量,不需要修改 export ASCEND_DEVICE_ID=$RANK_ID echo "Device ID: $ASCEND_DEVICE_ID" - - - + + + #创建DeviceID输出目录,不需要修改 if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} @@ -66,7 +66,7 @@ do #SOLVER.MAX_ITER 82000 \ #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -done +done #################启动训练脚本################# # 训练开始时间,不需要修改 -- Gitee From 4e7f23c24b790f87f3194ad8489ff710f0de4836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E6=9C=8B?= <3233877662@qq.com> Date: Thu, 7 Apr 2022 02:41:21 +0000 Subject: [PATCH 8/8] update train_full_8p.sh. --- .../cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh index 56b6147f44..b609a83864 100644 --- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh @@ -68,6 +68,7 @@ do #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 done +export rank_server=0 #################启动训练脚本################# # 训练开始时间,不需要修改 start_time=$(date +%s) -- Gitee