From 3564dfc36833d0c764316997ac1892f44119f974 Mon Sep 17 00:00:00 2001 From: xuzongqi <1450345865@qq.com> Date: Thu, 7 Apr 2022 13:32:48 +0000 Subject: [PATCH 1/3] update train_full_8p.sh. --- .../test/train_full_8p.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/GoogleNet_ID0051_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/GoogleNet_ID0051_for_TensorFlow/test/train_full_8p.sh index 37efbe5d9..f4506c7dc 100644 --- a/TensorFlow/built-in/cv/image_classification/GoogleNet_ID0051_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/GoogleNet_ID0051_for_TensorFlow/test/train_full_8p.sh @@ -124,10 +124,10 @@ do --iterations_per_loop=100 \ --batch_size=$batch_size \ --display_every=256 \ - --data_path=$data_path \ + --data_path=$data_path \ --lr=0.01 \ - --log_dir=./model \ - --eval_dir=./model \ + --log_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt \ + --eval_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt \ --epochs_between_evals=50 \ --log_name=googlenet.log > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done @@ -145,14 +145,14 @@ BatchSize=${batch_size} DeviceType=`uname -m` CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' #获取性能数据 -FPS=`grep epoch $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F 'FPS:' '{print $2}'|awk '{print $1}'|awk 'NR>1'|awk '{sum+=$1} END {print sum/NR}'` +FPS=`grep epoch $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $6}'|grep -v UTC|awk '{print $1}'|awk '{sum+=$1} END {print sum/NR}'` ActualFPS=$FPS temp1=`echo "8000 * ${batch_size}"|bc` TrainingTime=`echo "scale=2;${temp1} / ${ActualFPS}"|bc` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep epoch $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F 'loss:' '{print $2}'|awk '{print $1}'> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep epoch $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F 'loss:' '{print $2}'|awk '{print $1}'|tr -s '\n'> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` -- Gitee From de4b09a88e38a3be384d0e42bbe69e34ebb60065 Mon Sep 17 00:00:00 2001 From: xuzongqi <1450345865@qq.com> Date: Thu, 7 Apr 2022 13:34:45 +0000 Subject: [PATCH 2/3] update train_full_8p.sh. --- .../test/train_full_8p.sh | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/AlexNet_ID0072_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/AlexNet_ID0072_for_TensorFlow/test/train_full_8p.sh index 50fd36ef3..71ba419cf 100644 --- a/TensorFlow/built-in/cv/image_classification/AlexNet_ID0072_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/AlexNet_ID0072_for_TensorFlow/test/train_full_8p.sh @@ -140,19 +140,28 @@ python3.7 ${cur_path}/../train.py --rank_size=8 \ done wait #设置环境变量,不需要修改 -export RANK_ID=7 -export DEVICE_INDEX=7 -export ASCEND_DEVICE_ID=7 -python3 ${cur_path}/../train.py --rank_size=8 \ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID + + export DEVICE_ID=$RANK_ID + DEVICE_INDEX=$RANK_ID + export DEVICE_INDEX=${DEVICE_INDEX} + python3 ${cur_path}/../train.py --rank_size=8 \ --epochs_between_evals=1 \ --mode=evaluate \ - --max_epochs=150 \ + --max_epochs=150 \ --iterations_per_loop=100 \ - --batch_size=${batch_size} \ - --data_dir=${data_path} \ - --lr=0.06 \ + --batch_size=${batch_size} \ + --data_dir=${data_path} \ + --lr=0.06 \ --checkpoint_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \ - --log_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --log_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + done wait @@ -189,7 +198,7 @@ ActualFPS=${step_sec} TrainingTime=`expr ${batch_size} \* 1000 / ${step_sec}` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -`grep total_loss ${cur_path}/output/7/train_8.log|awk '{print $9}'|tr -d , >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +`grep total_loss ${cur_path}/output/7/train_7.log|awk '{print $9}'|tr -d , >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` -- Gitee From 55acebcc1d0f9c388285729cda5b50146fcebdab Mon Sep 17 00:00:00 2001 From: xuzongqi <1450345865@qq.com> Date: Thu, 7 Apr 2022 13:40:45 +0000 Subject: [PATCH 3/3] update train_full_8p.sh. --- .../test/train_full_8p.sh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/ResNet101_ID0063_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/ResNet101_ID0063_for_TensorFlow/test/train_full_8p.sh index b47200873..bb42eb06b 100644 --- a/TensorFlow/built-in/cv/image_classification/ResNet101_ID0063_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/ResNet101_ID0063_for_TensorFlow/test/train_full_8p.sh @@ -148,9 +148,17 @@ do --model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait - export RANK_ID=7 - export DEVICE_INDEX=7 - export ASCEND_DEVICE_ID=7 +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID + + export DEVICE_ID=$RANK_ID + DEVICE_INDEX=$RANK_ID + export DEVICE_INDEX=${DEVICE_INDEX} python3.7 $cur_path/../r1/resnet/imagenet_main.py \ --resnet_size=101 \ --batch_size=${batch_size} \ -- Gitee