From 1af439252700e85d1ec9c04ac47bc6e099b587a7 Mon Sep 17 00:00:00 2001 From: unknown <115967783@qq.com> Date: Tue, 21 Mar 2023 15:36:05 +0800 Subject: [PATCH 1/5] update Facenet_ID0122_for_TensorFlow after precision improvement --- .../src/facenet.py | 48 ++++++++++----- .../src/fp16fp32.txt | 24 ++++++++ .../src/train_softmax.py | 24 +++++--- .../test/train_full_1p.sh | 46 +++++--------- .../test/train_full_8p.sh | 60 ++++++++----------- 5 files changed, 110 insertions(+), 92 deletions(-) create mode 100644 TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/fp16fp32.txt diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/facenet.py b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/facenet.py index 7867143d8..f26031637 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/facenet.py +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/facenet.py @@ -50,6 +50,15 @@ if "RANK_SIZE" in os.environ: RANK_SIZE = int(os.environ["RANK_SIZE"]) if RANK_SIZE > 1: from npu_bridge.npu_init import * +use_NPU = True +try: + from npu_bridge.npu_init import * +except Exception as e: + print(e) + print("Use GPU to run code.") + use_NPU = False +print("use_NPU:", use_NPU) + # -----8P modified end----- def triplet_loss(anchor, positive, negative, alpha): @@ -181,24 +190,31 @@ def train(total_loss, global_step, optimizer, learning_rate, moving_average_deca loss_averages_op = _add_loss_summaries(total_loss) # Compute gradients. - with tf.control_dependencies([loss_averages_op]): - if optimizer=='ADAGRAD': - opt = tf.train.AdagradOptimizer(learning_rate) - elif optimizer=='ADADELTA': - opt = tf.train.AdadeltaOptimizer(learning_rate, rho=0.9, epsilon=1e-6) - elif optimizer=='ADAM': - opt = tf.train.AdamOptimizer(learning_rate, beta1=0.9, beta2=0.999, epsilon=0.1) - elif optimizer=='RMSPROP': - opt = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.9, epsilon=1.0) - elif optimizer=='MOM': - opt = tf.train.MomentumOptimizer(learning_rate, 0.9, use_nesterov=True) - else: - raise ValueError('Invalid optimization algorithm') + if optimizer=='ADAGRAD': + opt = tf.train.AdagradOptimizer(learning_rate) + elif optimizer=='ADADELTA': + opt = tf.train.AdadeltaOptimizer(learning_rate, rho=0.9, epsilon=1e-6) + elif optimizer=='ADAM': + opt = tf.train.AdamOptimizer(learning_rate, beta1=0.9, beta2=0.999, epsilon=0.1) + elif optimizer=='RMSPROP': + opt = tf.train.RMSPropOptimizer(learning_rate, decay=0.9, momentum=0.9, epsilon=1.0) + elif optimizer=='MOM': + opt = tf.train.MomentumOptimizer(learning_rate, 0.9, use_nesterov=True) + else: + raise ValueError('Invalid optimization algorithm') - # -----8P modified start----- + # -----8P modified start----- + if RANK_SIZE > 1: + opt = npu_distributed_optimizer_wrapper(opt) + # -----8P modified end----- + if use_NPU: + #loss_scale_manager = FixedLossScaleManager(loss_scale=256) + loss_scale_manager = ExponentialUpdateLossScaleManager(init_loss_scale=4194304, incr_every_n_steps=2000, decr_every_n_nan_or_inf=1, decr_ratio=0.5) if RANK_SIZE > 1: - opt = npu_distributed_optimizer_wrapper(opt) - # -----8P modified end----- + opt = NPULossScaleOptimizer(opt, loss_scale_manager, is_distributed=True) + else: + opt = NPULossScaleOptimizer(opt, loss_scale_manager) + with tf.control_dependencies([loss_averages_op]): grads = opt.compute_gradients(total_loss, update_gradient_vars) # Apply gradients. diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/fp16fp32.txt b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/fp16fp32.txt new file mode 100644 index 000000000..3a5ec590d --- /dev/null +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/fp16fp32.txt @@ -0,0 +1,24 @@ +OpType::MatMul:InputDtype:float32,float32,OutputDtype:float32 +OpType::MatMulV2:InputDtype:float32,float32,OutputDtype:float32 +OpType::BatchMatMul:InputDtype:float32,float32,OutputDtype:float32 +OpType::BatchMatMulV2:InputDtype:float32,float32,OutputDtype:float32 +OpType::Conv2D:InputDtype:float16,float16,OutputDtype:float32 +OpType::Conv2DBackpropInputD:InputDtype:float16,float16,OutputDtype:float32 +OpType::Conv2DBackpropInput:int32,InputDtype:float16,float16,OutputDtype:float32 +OpType::Conv2DTransposeD:InputDtype:float16,float16,OutputDtype:float32 +OpType::Conv2DTranspose:InputDtype:int32,float16,float16,OutputDtype:float32 +OpType::Conv2DBackpropFilterD:InputDtype:float16,float16,OutputDtype:float32 +OpType::Conv2DBackpropFilter:InputDtype:float16,int32,float16,OutputDtype:float32 +OpType::DepthwiseConv2D:InputDtype:float16,float16,OutputDtype:float32 +OpType::DepthwiseConv2DBackpropInputD:InputDtype:float16,float16,OutputDtype:float32 +OpType::DepthwiseConv2DBackpropInput:InputDtype:int32,float16,float16,OutputDtype:float32 +OpType::DepthwiseConv2DBackpropFilterD:InputDtype:float16,float16,OutputDtype:float32 +OpType::DepthwiseConv2DBackpropFilter:InputDtype:float16,int32,float16,OutputDtype:float32 +OpType::Deconvolution:InputDtype:float16,float16,OutputDtype:float32 +OpType::Conv3D:InputDtype:float16,float16,OutputDtype:float32 +OpType::Conv3DBackpropInputD:InputDtype:float16,float16,OutputDtype:float32 +OpType::Conv3DBackpropInput:InputDtype:int32,float16,float16,OutputDtype:float32 +OpType::Conv3DTransposeD:InputDtype:float16,float16,OutputDtype:float32 +OpType::Conv3DTranspose:InputDtype:int32,float16,float16,OutputDtype:float32 +OpType::Conv3DBackpropFilterD:InputDtype:float16,float16,OutputDtype:float32 +OpType::Conv3DBackpropFilter:InputDtype:float16,int32,float16,OutputDtype:float32 diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/train_softmax.py b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/train_softmax.py index 099970790..afc6a0d78 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/train_softmax.py +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/train_softmax.py @@ -201,7 +201,7 @@ def main(args): tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, prelogits_center_loss * args.center_loss_factor) learning_rate = tf.train.exponential_decay(learning_rate_placeholder, global_step, - args.learning_rate_decay_epochs*args.epoch_size, args.learning_rate_decay_factor, staircase=True) + args.learning_rate_decay_epochs*args.epoch_size, args.learning_rate_decay_factor, staircase=False) tf.summary.scalar('learning_rate', learning_rate) # Calculate the average cross entropy loss across the batch @@ -222,7 +222,7 @@ def main(args): learning_rate, args.moving_average_decay, tf.global_variables(), args.log_histograms) # Create a saver - saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=3) + saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=args.max_nrof_epochs) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.summary.merge_all() @@ -235,6 +235,9 @@ def main(args): custom_op.name = "NpuOptimizer" custom_op.parameter_map["use_off_line"].b = True custom_op.parameter_map["mix_compile_mode"].b = True + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_fp32_to_fp16") + # custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") + custom_op.parameter_map["customize_dtypes"].s = tf.compat.as_bytes("fp16fp32.txt") config.graph_options.rewrite_options.remapping = RewriterConfig.OFF sess = tf.Session(config=config) else: @@ -295,13 +298,16 @@ def main(args): if not cont: break - - t = time.time() - if len(val_image_list)>0 and ((epoch-1) % args.validate_every_n_epochs == args.validate_every_n_epochs-1 or epoch==args.max_nrof_epochs): - validate(args, sess, epoch, val_image_list, val_label_list, enqueue_op, image_paths_placeholder, labels_placeholder, control_placeholder, - phase_train_placeholder, batch_size_placeholder, - stat, total_loss, regularization_losses, cross_entropy_mean, accuracy, args.validate_every_n_epochs, args.use_fixed_image_standardization) - stat['time_validate'][epoch-1] = time.time() - t + + if use_NPU: + pass + else: + t = time.time() + if len(val_image_list)>0 and ((epoch-1) % args.validate_every_n_epochs == args.validate_every_n_epochs-1 or epoch==args.max_nrof_epochs): + validate(args, sess, epoch, val_image_list, val_label_list, enqueue_op, image_paths_placeholder, labels_placeholder, control_placeholder, + phase_train_placeholder, batch_size_placeholder, + stat, total_loss, regularization_losses, cross_entropy_mean, accuracy, args.validate_every_n_epochs, args.use_fixed_image_standardization) + stat['time_validate'][epoch-1] = time.time() - t # Save variables and the metagraph if it doesn't exist already save_variables_and_metagraph(sess, saver, summary_writer, model_dir, subdir, epoch) diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh index 793979344..40ed553f8 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh @@ -5,7 +5,7 @@ cur_path=`pwd`/../ #export ASCEND_SLOG_PRINT_TO_STDOUT=1 export ENABLE_FORCE_V2_CONTROL=1 -#export ASCEND_DEVICE_ID=7 +export ASCEND_DEVICE_ID=0 #基础参数,需要模型审视修改 #Batch Size batch_size=90 @@ -62,7 +62,9 @@ nohup python3 ${cur_path}/src/train_softmax.py \ --keep_probability 0.8 \ --random_crop \ --random_flip \ - --random_rotate \ + --lfw_distance_metric 1 \ + --lfw_use_flipped_images \ + --lfw_subtract_mean \ --use_fixed_image_standardization \ --learning_rate_schedule_file ${cur_path}/data/learning_rate_schedule_classifier_casia.txt \ --weight_decay 5e-4 \ @@ -102,40 +104,22 @@ grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print $1}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` -ckpt_path=`ls ${cur_path}/src/models/$ASCEND_DEVICE_ID/*/*ckpt-${train_epoch}.index` -ckpt_path=${ckpt_path%.*} -# run evalute -python3 ${cur_path}/src/train_softmax.py \ - --logs_base_dir ${cur_path}/src/logs/ \ - --models_base_dir ${cur_path}/src/models/ \ - --data_dir ${data_path}/CASIA-WebFace_182/ \ - --lfw_dir ${cur_path}/lfw/datasets \ - --pretrained_model ${ckpt_path} \ - --batch_size ${batch_size} \ - --image_size 160 \ - --epoch_size 1 \ - --model_def models.inception_resnet_v1 \ - --optimizer ADAM \ - --learning_rate -1 \ - --max_nrof_epochs 1 \ - --keep_probability 0.8 \ - --random_crop \ - --random_flip \ - --random_rotate \ - --use_fixed_image_standardization \ - --learning_rate_schedule_file ${cur_path}/data/learning_rate_schedule_classifier_casia.txt \ - --weight_decay 5e-4 \ - --embedding_size 512 \ - --validation_set_split_ratio 0 \ - --validate_every_n_epochs 5 \ - --prelogits_norm_loss_factor 5e-4 >> $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 - #train_accuracy -train_accuracy=`grep 'Accuracy:' $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $2}' |tail -1` +train_accuracy=`grep -rn 'Accuracy:' $cur_path/test/output/*/* | awk '{print $2}' | awk -F'%' 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'` #打印,不需要修改 echo "train_accuracy : $train_accuracy" +saved_model_path=${cur_path}/src/models/saved_model_path/`date '+%Y%m%d%H%M'` +mkdir $saved_model_path -p +bst_e=`grep -rn -B 10 $train_accuracy $cur_path/test/output/*/* | head -n 1 | awk '{print $2}'` +bst_e=${bst_e%%]*} +bst_e=${bst_e#*[} +cp ${cur_path}/src/models/${ASCEND_DEVICE_ID}/*/*ckpt-${bst_e}* ${saved_model_path} +cp ${cur_path}/src/models/${ASCEND_DEVICE_ID}/*/*.meta ${saved_model_path} +cp ${cur_path}/src/models/${ASCEND_DEVICE_ID}/*/checkpoint ${saved_model_path} +sed -i "1s/ckpt-${train_epoch}/ckpt-${bst_e}/" ${saved_model_path}/checkpoint +echo "saved model path: ${saved_mode_path}/" ##获取错误信息 diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh index e54c2b226..235e181ec 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh @@ -59,23 +59,28 @@ do --logs_base_dir ${cur_path}/src/logs/$ASCEND_DEVICE_ID \ --models_base_dir ${cur_path}/src/models/$ASCEND_DEVICE_ID \ --data_dir ${data_path}/CASIA-WebFace_182/ \ + --lfw_dir ${cur_path}/lfw/datasets \ --batch_size ${batch_size} \ --image_size 160 \ --model_def models.inception_resnet_v1 \ --optimizer ADAM \ - --learning_rate -1 \ + --learning_rate 0.6 \ + --learning_rate_decay_epochs 1 \ + --learning_rate_decay_factor 0.7 \ --max_nrof_epochs ${train_epoch} \ - --keep_probability 0.8 \ + --keep_probability 1.0 \ --random_crop \ --random_flip \ - --random_rotate \ + --lfw_distance_metric 1 \ + --lfw_use_flipped_images \ + --lfw_subtract_mean \ --use_fixed_image_standardization \ --learning_rate_schedule_file ${cur_path}/data/learning_rate_schedule_classifier_casia_8p.txt \ --weight_decay 5e-4 \ --embedding_size 512 \ --validation_set_split_ratio 0.05 \ --validate_every_n_epochs 5 \ - --prelogits_norm_loss_factor 5e-4 > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & + --prelogits_norm_loss_factor 1e-3 > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & done wait end=$(date +%s) @@ -110,43 +115,26 @@ grep RegLoss $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print $1}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` -unset RANK_TABLE_FILE -unset RANK_SIZE -ckpt_path=`ls ${cur_path}/src/models/$ASCEND_DEVICE_ID/*/*ckpt-${train_epoch}.index` -ckpt_path=${ckpt_path%.*} -# run evalute -python3 ${cur_path}/src/train_softmax.py \ - --logs_base_dir ${cur_path}/src/logs/ \ - --models_base_dir ${cur_path}/src/models/ \ - --data_dir ${data_path}/CASIA-WebFace_182/ \ - --lfw_dir ${cur_path}/lfw/datasets \ - --pretrained_model ${ckpt_path} \ - --batch_size ${batch_size} \ - --image_size 160 \ - --epoch_size 1 \ - --model_def models.inception_resnet_v1 \ - --optimizer ADAM \ - --learning_rate -1 \ - --max_nrof_epochs 1 \ - --keep_probability 0.8 \ - --random_crop \ - --random_flip \ - --random_rotate \ - --use_fixed_image_standardization \ - --learning_rate_schedule_file ${cur_path}/data/learning_rate_schedule_classifier_casia.txt \ - --weight_decay 5e-4 \ - --embedding_size 512 \ - --validation_set_split_ratio 0 \ - --validate_every_n_epochs 5 \ - --prelogits_norm_loss_factor 5e-4 >> $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 - #train_accuracy -train_accuracy=`grep 'Accuracy:' $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $2}' |tail -1` +train_accuracy=`grep -rn 'Accuracy:' $cur_path/test/output/*/* | awk '{print $2}' | awk -F'%' 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'` #打印,不需要修改 echo "train_accuracy : $train_accuracy" -RANK_SIZE=8 +saved_model_path=${cur_path}/src/models/saved_model_path/`date '+%Y%m%d%H%M'` +mkdir $saved_model_path -p +bst_d=`grep -rn $train_accuracy $cur_path/test/output/*/* | head -n 1` +bst_d=${bst_d%/*} +bst_d=${bst_d##*/} +bst_e=`grep -rn -B 10 $train_accuracy $cur_path/test/output/*/* | head -n 1 | awk '{print $2}'` +bst_e=${bst_e%%]*} +bst_e=${bst_e#*[} +cp ${cur_path}/src/models/${bst_d}/*/*ckpt-${bst_e}* ${saved_model_path} +cp ${cur_path}/src/models/${bst_d}/*/*.meta ${saved_model_path} +cp ${cur_path}/src/models/${bst_d}/*/checkpoint ${saved_model_path} +sed -i "1s/ckpt-${train_epoch}/ckpt-${bst_e}/" ${saved_model_path}/checkpoint +echo "saved model path: ${saved_mode_path}/" + ##获取错误信息 #系统错误信息 -- Gitee From 3c3d2372cb7efcccb52abebddb183125da6f686b Mon Sep 17 00:00:00 2001 From: unknown <115967783@qq.com> Date: Tue, 21 Mar 2023 16:07:36 +0800 Subject: [PATCH 2/5] update Facenet_ID0122_for_TensorFlow after precision improvement --- .../Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh | 3 ++- .../Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh | 6 ++---- .../test/train_performance_1p.sh | 2 +- .../test/train_performance_8p.sh | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh index 40ed553f8..ec8bdebaa 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh @@ -53,6 +53,7 @@ nohup python3 ${cur_path}/src/train_softmax.py \ --logs_base_dir ${cur_path}/src/logs/ \ --models_base_dir ${cur_path}/src/models/$ASCEND_DEVICE_ID \ --data_dir ${data_path}/CASIA-WebFace_182/ \ + --lfw_dir ${cur_path}/lfw/datasets \ --batch_size ${batch_size} \ --image_size 160 \ --model_def models.inception_resnet_v1 \ @@ -119,7 +120,7 @@ cp ${cur_path}/src/models/${ASCEND_DEVICE_ID}/*/*ckpt-${bst_e}* ${saved_model_pa cp ${cur_path}/src/models/${ASCEND_DEVICE_ID}/*/*.meta ${saved_model_path} cp ${cur_path}/src/models/${ASCEND_DEVICE_ID}/*/checkpoint ${saved_model_path} sed -i "1s/ckpt-${train_epoch}/ckpt-${bst_e}/" ${saved_model_path}/checkpoint -echo "saved model path: ${saved_mode_path}/" +echo "saved model path: ${saved_model_path}/" ##获取错误信息 diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh index 235e181ec..ba4f562ff 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh @@ -2,7 +2,6 @@ cur_path=`pwd`/../ #失败用例打屏 -#export ASCEND_SLOG_PRINT_TO_STDOUT=1 export JOB_ID=10096 export RANK_TABLE_FILE=${cur_path}/test/ranktable_8p.json export ENABLE_FORCE_V2_CONTROL=1 @@ -10,12 +9,11 @@ export ENABLE_FORCE_V2_CONTROL=1 export ASCEND_SLOG_PRINT_TO_STDOUT=0 export ASCEND_GLOBAL_LOG_LEVEL=3 -#export ASCEND_DEVICE_ID=7 #基础参数,需要模型审视修改 #Batch Size batch_size=90 #train epoch number -train_epoch=12 +train_epoch=15 #网络名称,同目录名称 Network="Facenet_ID0122_for_TensorFlow" #Device数量,单卡默认为1 @@ -133,7 +131,7 @@ cp ${cur_path}/src/models/${bst_d}/*/*ckpt-${bst_e}* ${saved_model_path} cp ${cur_path}/src/models/${bst_d}/*/*.meta ${saved_model_path} cp ${cur_path}/src/models/${bst_d}/*/checkpoint ${saved_model_path} sed -i "1s/ckpt-${train_epoch}/ckpt-${bst_e}/" ${saved_model_path}/checkpoint -echo "saved model path: ${saved_mode_path}/" +echo "saved model path: ${saved_model_path}/" ##获取错误信息 diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh index 38240503d..db5a414dd 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh @@ -5,7 +5,7 @@ cur_path=`pwd`/../ #export ASCEND_SLOG_PRINT_TO_STDOUT=1 export ENABLE_FORCE_V2_CONTROL=1 -#export ASCEND_DEVICE_ID=7 +export ASCEND_DEVICE_ID=0 #基础参数,需要模型审视修改 #Batch Size batch_size=90 diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh index bcb94c6f3..4c5ebe052 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh @@ -7,7 +7,7 @@ export JOB_ID=10096 export RANK_TABLE_FILE=${cur_path}/test/ranktable_8p.json export ENABLE_FORCE_V2_CONTROL=1 -export ASCEND_SLOG_PRINT_TO_STDOUT=1 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 export ASCEND_GLOBAL_LOG_LEVEL=3 #export ASCEND_DEVICE_ID=7 -- Gitee From a70676d45779469a218463b1dac09d27107c5b7b Mon Sep 17 00:00:00 2001 From: unknown <115967783@qq.com> Date: Tue, 21 Mar 2023 17:18:51 +0800 Subject: [PATCH 3/5] update Facenet_ID0122_for_TensorFlow after precision improvement --- .../Facenet_ID0122_for_TensorFlow/src/train_softmax.py | 2 +- .../{src/fp16fp32.txt => switch_config.txt} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/{src/fp16fp32.txt => switch_config.txt} (100%) diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/train_softmax.py b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/train_softmax.py index afc6a0d78..4042f8871 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/train_softmax.py +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/train_softmax.py @@ -237,7 +237,7 @@ def main(args): custom_op.parameter_map["mix_compile_mode"].b = True custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_fp32_to_fp16") # custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") - custom_op.parameter_map["customize_dtypes"].s = tf.compat.as_bytes("fp16fp32.txt") + custom_op.parameter_map["customize_dtypes"].s = tf.compat.as_bytes("./switch_config.txt") config.graph_options.rewrite_options.remapping = RewriterConfig.OFF sess = tf.Session(config=config) else: diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/fp16fp32.txt b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/switch_config.txt similarity index 100% rename from TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/src/fp16fp32.txt rename to TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/switch_config.txt -- Gitee From 90bb70d38ba88a77e934c93a9b8b5f2d8cb9988f Mon Sep 17 00:00:00 2001 From: unknown <115967783@qq.com> Date: Tue, 21 Mar 2023 17:31:02 +0800 Subject: [PATCH 4/5] update Facenet_ID0122_for_TensorFlow after precision improvement --- .../test/train_performance_1p.sh | 5 ++++- .../test/train_performance_8p.sh | 13 +++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh index db5a414dd..93abfa65a 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh @@ -49,6 +49,7 @@ nohup python3 ${cur_path}/src/train_softmax.py \ --logs_base_dir ${cur_path}/src/logs/ \ --models_base_dir ${cur_path}/src/models/ \ --data_dir ${data_path}/CASIA-WebFace_182/ \ + --lfw_dir ${cur_path}/lfw/datasets \ --batch_size ${batch_size} \ --image_size 160 \ --model_def models.inception_resnet_v1 \ @@ -58,7 +59,9 @@ nohup python3 ${cur_path}/src/train_softmax.py \ --keep_probability 0.8 \ --random_crop \ --random_flip \ - --random_rotate \ + --lfw_distance_metric 1 \ + --lfw_use_flipped_images \ + --lfw_subtract_mean \ --use_fixed_image_standardization \ --learning_rate_schedule_file ${cur_path}/data/learning_rate_schedule_classifier_casia.txt \ --weight_decay 5e-4 \ diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh index 4c5ebe052..d36029e26 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh @@ -55,23 +55,28 @@ do --logs_base_dir ${cur_path}/src/logs/$ASCEND_DEVICE_ID \ --models_base_dir ${cur_path}/src/models/ \ --data_dir ${data_path}/CASIA-WebFace_182/ \ + --lfw_dir ${cur_path}/lfw/datasets \ --batch_size ${batch_size} \ --image_size 160 \ --model_def models.inception_resnet_v1 \ --optimizer ADAM \ - --learning_rate -1 \ + --learning_rate 0.6 \ + --learning_rate_decay_epochs 1 \ + --learning_rate_decay_factor 0.7 \ --max_nrof_epochs 3 \ - --keep_probability 0.8 \ + --keep_probability 1.0 \ --random_crop \ --random_flip \ - --random_rotate \ + --lfw_distance_metric 1 \ + --lfw_use_flipped_images \ + --lfw_subtract_mean \ --use_fixed_image_standardization \ --learning_rate_schedule_file ${cur_path}/data/learning_rate_schedule_classifier_casia_8p.txt \ --weight_decay 5e-4 \ --embedding_size 512 \ --validation_set_split_ratio 0.05 \ --validate_every_n_epochs 5 \ - --prelogits_norm_loss_factor 5e-4 > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & + --prelogits_norm_loss_factor 1e-3 > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & done wait end=$(date +%s) -- Gitee From bd316fde8de3b3d248293333828d953960a639d4 Mon Sep 17 00:00:00 2001 From: unknown <115967783@qq.com> Date: Sat, 25 Mar 2023 15:53:25 +0800 Subject: [PATCH 5/5] update Facenet_ID0122_for_TensorFlow after precision improvement --- .../Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh | 2 +- .../Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh | 2 +- .../Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh | 2 +- .../Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh index ec8bdebaa..33321fd8c 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_1p.sh @@ -53,7 +53,7 @@ nohup python3 ${cur_path}/src/train_softmax.py \ --logs_base_dir ${cur_path}/src/logs/ \ --models_base_dir ${cur_path}/src/models/$ASCEND_DEVICE_ID \ --data_dir ${data_path}/CASIA-WebFace_182/ \ - --lfw_dir ${cur_path}/lfw/datasets \ + --lfw_dir ${data_path}/lfw_mtcnnpy_160/ \ --batch_size ${batch_size} \ --image_size 160 \ --model_def models.inception_resnet_v1 \ diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh index ba4f562ff..7d060246b 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_full_8p.sh @@ -57,7 +57,7 @@ do --logs_base_dir ${cur_path}/src/logs/$ASCEND_DEVICE_ID \ --models_base_dir ${cur_path}/src/models/$ASCEND_DEVICE_ID \ --data_dir ${data_path}/CASIA-WebFace_182/ \ - --lfw_dir ${cur_path}/lfw/datasets \ + --lfw_dir ${data_path}/lfw_mtcnnpy_160/ \ --batch_size ${batch_size} \ --image_size 160 \ --model_def models.inception_resnet_v1 \ diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh index 93abfa65a..a3fab0377 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_1p.sh @@ -49,7 +49,7 @@ nohup python3 ${cur_path}/src/train_softmax.py \ --logs_base_dir ${cur_path}/src/logs/ \ --models_base_dir ${cur_path}/src/models/ \ --data_dir ${data_path}/CASIA-WebFace_182/ \ - --lfw_dir ${cur_path}/lfw/datasets \ + --lfw_dir ${data_path}/lfw_mtcnnpy_160/ \ --batch_size ${batch_size} \ --image_size 160 \ --model_def models.inception_resnet_v1 \ diff --git a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh index d36029e26..9342863ad 100644 --- a/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/Facenet_ID0122_for_TensorFlow/test/train_performance_8p.sh @@ -55,7 +55,7 @@ do --logs_base_dir ${cur_path}/src/logs/$ASCEND_DEVICE_ID \ --models_base_dir ${cur_path}/src/models/ \ --data_dir ${data_path}/CASIA-WebFace_182/ \ - --lfw_dir ${cur_path}/lfw/datasets \ + --lfw_dir ${data_path}/lfw_mtcnnpy_160/ \ --batch_size ${batch_size} \ --image_size 160 \ --model_def models.inception_resnet_v1 \ -- Gitee