From 86203378dc3c9f73760fd788617c01ee75a844a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BB=8E=E6=9C=A8=E6=9E=97?= <762129126@qq.com>
Date: Mon, 15 May 2023 11:39:32 +0800
Subject: [PATCH] =?UTF-8?q?[UPDATE]=E4=BF=AE=E6=94=B9=E4=B8=BA=E9=9D=99?=
 =?UTF-8?q?=E6=80=81loss=5Fscale=EF=BC=9B[UPDATE]=E4=BF=AE=E6=94=B9fps?=
 =?UTF-8?q?=E8=AE=A1=E7=AE=97=E6=96=B9=E5=BC=8F=E4=B8=BA=E5=8F=96=E5=B9=B3?=
 =?UTF-8?q?=E5=9D=87=E5=80=BC=EF=BC=9B[ADD]8p=E6=8B=89=E8=B5=B7=E8=84=9A?=
 =?UTF-8?q?=E6=9C=AC=E5=A2=9E=E5=8A=A0=E7=BB=91=E6=A0=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (free text between the "---" marker and the first "diff --git"
line; it is not part of the commit message or of any hunk).

Subject, translated: "[UPDATE] switch to a static loss_scale; [UPDATE] compute
FPS as a plain average; [ADD] add CPU core binding to the 8p launch scripts".

* src/facenet.py
  - Replaces ExponentialUpdateLossScaleManager(init_loss_scale=4194304, ...)
    with FixedLossScaleManager(..., enable_overflow_check=False); the old
    call is kept as a commented-out line.
  - The fixed scale is 2097152 (2**21, half of the previous 4194304). The
    submitted value 2097156 looked like a typo and is corrected here.
    NOTE(review): confirm with the author that a power of two was intended.

* test/train_full_1p.sh, train_full_8p.sh, train_performance_1p.sh,
  train_performance_8p.sh
  - The "tail -n +2" (and, in the full scripts, "head -4999" / "head -2999")
    filters are dropped, so TrainingTime is now the mean of column 4 over
    every "RegLoss" line in the log, including the first logged step.

* test/train_full_8p.sh, test/train_performance_8p.sh
  - A new "--bind_core*" argument sets bind_core and name_bind="_bindcore";
    name_bind is inserted into CaseName.
  - Inside the launch loop, when bind_core is non-empty it is replaced by
    "taskset -c $a-$c" with a = RANK_ID*corenum/8 and
    c = (RANK_ID+1)*corenum/8 - 1, where corenum is the number of
    "processor" lines in /proc/cpuinfo. When bind_core is empty the training
    command is launched without taskset.
  - The first three "let a/b/c" assignments (hard-coded 12) are overwritten
    by the corenum-based ones before being read; they are dead code and can
    be dropped in a follow-up.
  - NOTE(review): the divisor 8 is hard-coded rather than taken from
    RANK_SIZE; fine for these 8p scripts, verify before reusing elsewhere.

NOTE(review): leading indentation of context lines, and the position of the
blank context lines after each CaseName change, were reconstructed; check
them against the target files or apply with whitespace tolerance.

 .../src/facenet.py                            |  4 ++--
 .../test/train_full_1p.sh                     |  2 +-
 .../test/train_full_8p.sh                     | 22 ++++++++++++++++---
 .../test/train_performance_1p.sh              |  2 +-
 .../test/train_performance_8p.sh              | 22 ++++++++++++++++---
 5 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/facenet.py b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/facenet.py
index f26031637..b6ccb34c3 100644
--- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/facenet.py
+++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/facenet.py
@@ -208,8 +208,8 @@ def train(total_loss, global_step, optimizer, learning_rate, moving_average_deca
         opt = npu_distributed_optimizer_wrapper(opt)
     # -----8P modified end-----
     if use_NPU:
-        #loss_scale_manager = FixedLossScaleManager(loss_scale=256)
-        loss_scale_manager = ExponentialUpdateLossScaleManager(init_loss_scale=4194304, incr_every_n_steps=2000, decr_every_n_nan_or_inf=1, decr_ratio=0.5)
+        loss_scale_manager = FixedLossScaleManager(loss_scale=2097152, enable_overflow_check=False)
+        #loss_scale_manager = ExponentialUpdateLossScaleManager(init_loss_scale=4194304, incr_every_n_steps=2000, decr_every_n_nan_or_inf=1, decr_ratio=0.5)
         if RANK_SIZE > 1:
             opt = NPULossScaleOptimizer(opt, loss_scale_manager, is_distributed=True)
         else:
diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh
index 012d534c7..9328bc68c 100644
--- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh
+++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh
@@ -86,7 +86,7 @@ echo "Final Training Duration sec : $e2etime"
 #结果打印,不需要修改
 echo "------------------ Final result ------------------"
 #输出性能FPS,需要模型审视修改
-TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| tail -n +2 | head -4999 | awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'`
+TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'`
 wait
 FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${TrainingTime}'}'`
 #打印,不需要修改
diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh
index fd98f04ea..ad90c8111 100644
--- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh
+++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh
@@ -33,6 +33,9 @@ for para in $*
 do
     if [[ $para == --data_path* ]];then
         data_path=`echo ${para#*=}`
+    elif [[ $para == --bind_core* ]];then
+        bind_core=`echo ${para#*=}`
+        name_bind="_bindcore"
     elif [[ $para == --switch_config* ]];then
         switch_config=`echo ${para#*=}`
     fi
@@ -56,7 +59,20 @@ do
     mkdir -p $cur_path/src/logs/$ASCEND_DEVICE_ID
     rm -rf ${cur_path}/src/models/$ASCEND_DEVICE_ID
     mkdir -p ${cur_path}/src/models/$ASCEND_DEVICE_ID
-    nohup python3 ${cur_path}/src/train_softmax.py \
+    # Core binding: delete for models that do not need it; review and adjust for models that do
+    let a=RANK_ID*12
+    let b=RANK_ID+1
+    let c=b*12-1
+
+    corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l`
+    let a=RANK_ID*${corenum}/8
+    let b=RANK_ID+1
+    let c=b*${corenum}/8-1
+    if [ "x${bind_core}" != x ];then
+        bind_core="taskset -c $a-$c"
+    fi
+
+    nohup ${bind_core} python3 ${cur_path}/src/train_softmax.py \
         --logs_base_dir ${cur_path}/src/logs/$ASCEND_DEVICE_ID \
         --models_base_dir ${cur_path}/src/models/$ASCEND_DEVICE_ID \
         --data_dir ${data_path}/CASIA-WebFace_182/ \
@@ -93,7 +109,7 @@ echo "Final Training Duration sec : $e2etime"
 #结果打印,不需要修改
 echo "------------------ Final result ------------------"
 #输出性能FPS,需要模型审视修改
-TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| tail -n +2 | head -2999 | awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'`
+TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'`
 wait
 FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${TrainingTime}'}'`
 FPS=$(awk 'BEGIN{print '$FPS'*8}')
@@ -107,7 +123,7 @@ BatchSize=${batch_size}
 #设备类型,自动获取
 DeviceType=`uname -m`
 #用例名称,自动获取
-CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
 
 
 #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视
diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh
index 6da62369e..6cbe521fd 100644
--- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh
+++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh
@@ -82,7 +82,7 @@ echo "Final Training Duration sec : $e2etime"
 #结果打印,不需要修改
 echo "------------------ Final result ------------------"
 #输出性能FPS,需要模型审视修改
-TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| tail -n +2 | awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'`
+TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'`
 wait
 FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${TrainingTime}'}'`
 #打印,不需要修改
diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh
index 3673d4f41..d14ac258b 100644
--- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh
+++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh
@@ -33,6 +33,9 @@ for para in $*
 do
     if [[ $para == --data_path* ]];then
         data_path=`echo ${para#*=}`
+    elif [[ $para == --bind_core* ]];then
+        bind_core=`echo ${para#*=}`
+        name_bind="_bindcore"
     elif [[ $para == --switch_config* ]];then
         switch_config=`echo ${para#*=}`
     fi
@@ -54,7 +57,20 @@ do
     export RANK_ID=${ID}
     mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID
     mkdir -p $cur_path/src/logs/$ASCEND_DEVICE_ID
-    nohup python3 ${cur_path}/src/train_softmax.py \
+    # Core binding: delete for models that do not need it; review and adjust for models that do
+    let a=RANK_ID*12
+    let b=RANK_ID+1
+    let c=b*12-1
+
+    corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l`
+    let a=RANK_ID*${corenum}/8
+    let b=RANK_ID+1
+    let c=b*${corenum}/8-1
+    if [ "x${bind_core}" != x ];then
+        bind_core="taskset -c $a-$c"
+    fi
+
+    nohup ${bind_core} python3 ${cur_path}/src/train_softmax.py \
         --logs_base_dir ${cur_path}/src/logs/$ASCEND_DEVICE_ID \
         --models_base_dir ${cur_path}/src/models/ \
         --data_dir ${data_path}/CASIA-WebFace_182/ \
@@ -91,7 +107,7 @@ echo "Final Training Duration sec : $e2etime"
 #结果打印,不需要修改
 echo "------------------ Final result ------------------"
 #输出性能FPS,需要模型审视修改
-TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| tail -n +2 | awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'`
+TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'`
 wait
 FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${TrainingTime}'}'`
 FPS=$(awk 'BEGIN{print '$FPS'*8}')
@@ -105,7 +121,7 @@ BatchSize=${batch_size}
 #设备类型,自动获取
 DeviceType=`uname -m`
 #用例名称,自动获取
-CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
 
 
 #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视
-- 
Gitee