diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/facenet.py b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/facenet.py index f26031637d7604ffe1bad4d9dc145da1cf3dbb90..b6ccb34c3af3c1289417b8e64c073e7d425b80d1 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/facenet.py +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/facenet.py @@ -208,8 +208,8 @@ def train(total_loss, global_step, optimizer, learning_rate, moving_average_deca opt = npu_distributed_optimizer_wrapper(opt) # -----8P modified end----- if use_NPU: - #loss_scale_manager = FixedLossScaleManager(loss_scale=256) - loss_scale_manager = ExponentialUpdateLossScaleManager(init_loss_scale=4194304, incr_every_n_steps=2000, decr_every_n_nan_or_inf=1, decr_ratio=0.5) + loss_scale_manager = FixedLossScaleManager(loss_scale=2097156, enable_overflow_check=False) + #loss_scale_manager = ExponentialUpdateLossScaleManager(init_loss_scale=4194304, incr_every_n_steps=2000, decr_every_n_nan_or_inf=1, decr_ratio=0.5) if RANK_SIZE > 1: opt = NPULossScaleOptimizer(opt, loss_scale_manager, is_distributed=True) else: diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh index 012d534c7d0abe17e53fd6ad271ddf4244ade2df..9328bc68c126f0916d49baae173b357153ce8de7 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh @@ -86,7 +86,7 @@ echo "Final Training Duration sec : $e2etime" #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| tail -n +2 | head -4999 | awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'` +TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'` wait FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${TrainingTime}'}'` #打印,不需要修改 diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh index fd98f04eabdae3dfeaf9e62de2a9a47ee66646a6..ad90c8111eb8153eea0f95f7e35b87ce33eb511f 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh @@ -33,6 +33,9 @@ for para in $* do if [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" elif [[ $para == --switch_config* ]];then switch_config=`echo ${para#*=}` fi @@ -56,7 +59,20 @@ do mkdir -p $cur_path/src/logs/$ASCEND_DEVICE_ID rm -rf ${cur_path}/src/models/$ASCEND_DEVICE_ID mkdir -p ${cur_path}/src/models/$ASCEND_DEVICE_ID - nohup python3 ${cur_path}/src/train_softmax.py \ + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l` + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 + let c=b*${corenum}/8-1 + if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" + fi + + nohup ${bind_core} python3 ${cur_path}/src/train_softmax.py \ --logs_base_dir ${cur_path}/src/logs/$ASCEND_DEVICE_ID \ --models_base_dir ${cur_path}/src/models/$ASCEND_DEVICE_ID \ --data_dir ${data_path}/CASIA-WebFace_182/ \ @@ -93,7 +109,7 @@ echo "Final Training Duration sec : $e2etime" #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| tail -n +2 | head -2999 | awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'` +TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'` wait FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${TrainingTime}'}'` FPS=$(awk 'BEGIN{print '$FPS'*8}') @@ -107,7 +123,7 @@ BatchSize=${batch_size} #设备类型,自动获取 DeviceType=`uname -m` #用例名称,自动获取 -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh index 6da62369e24097c0e251b7c087a059df12eeff8f..6cbe521fd0a5a6ac9e348ec2c9194f552e4dc148 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh @@ -82,7 +82,7 @@ echo "Final Training Duration sec : $e2etime" #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| tail -n +2 | awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'` +TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'` wait FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${TrainingTime}'}'` #打印,不需要修改 diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh index 3673d4f41be6db4d13bda562f6931b7b18b490f8..d14ac258bb4ccffa2b3d3540cd59baf5e715aeca 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh @@ -33,6 +33,9 @@ for para in $* do if [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" elif [[ $para == --switch_config* ]];then switch_config=`echo ${para#*=}` fi @@ -54,7 +57,20 @@ do export RANK_ID=${ID} mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID mkdir -p $cur_path/src/logs/$ASCEND_DEVICE_ID - nohup python3 ${cur_path}/src/train_softmax.py \ + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l` + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 + let c=b*${corenum}/8-1 + if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" + fi + + nohup ${bind_core} python3 ${cur_path}/src/train_softmax.py \ --logs_base_dir ${cur_path}/src/logs/$ASCEND_DEVICE_ID \ --models_base_dir ${cur_path}/src/models/ \ --data_dir ${data_path}/CASIA-WebFace_182/ \ @@ -91,7 +107,7 @@ echo "Final Training Duration sec : $e2etime" #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| tail -n +2 | awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'` +TrainingTime=`grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| awk '{print $4}' | tr -d s | awk '{sum+=$1} END {print sum/NR}'` wait FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${TrainingTime}'}'` FPS=$(awk 'BEGIN{print '$FPS'*8}') @@ -105,7 +121,7 @@ BatchSize=${batch_size} #设备类型,自动获取 DeviceType=`uname -m` #用例名称,自动获取 -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视