From 331f808b4cb92710ac5d8546a0e0dc5189900a36 Mon Sep 17 00:00:00 2001 From: hxxhl88 <736544296@qq.com> Date: Thu, 14 Apr 2022 16:44:38 +0800 Subject: [PATCH] modify dropout --- .../train_full_base_8p_192bs_lamb_phase1.sh | 2 +- ...in_performance_base_1p_24bs_lamb_phase1.sh | 2 +- ...rmance_base_1p_24bs_lamb_phase1_dropout.sh | 220 ------------------ ...n_performance_base_8p_192bs_lamb_phase1.sh | 2 +- 4 files changed, 3 insertions(+), 223 deletions(-) delete mode 100644 TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_base_1p_24bs_lamb_phase1_dropout.sh diff --git a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_full_base_8p_192bs_lamb_phase1.sh b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_full_base_8p_192bs_lamb_phase1.sh index 0770515e1..59e0bf5c3 100644 --- a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_full_base_8p_192bs_lamb_phase1.sh +++ b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_full_base_8p_192bs_lamb_phase1.sh @@ -138,7 +138,7 @@ do --max_predictions_per_seq=20 \ --max_seq_length=128 \ --mixlist_file='ops_info_mul.json' \ - --use_npu_dropout=False \ + --use_npu_dropout=True \ --model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} \ --num_accumulation_steps=1 \ --distribution_strategy=one_device \ diff --git a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_base_1p_24bs_lamb_phase1.sh b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_base_1p_24bs_lamb_phase1.sh index df44c61ce..4d89fe0bc 100644 --- a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_base_1p_24bs_lamb_phase1.sh +++ b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_base_1p_24bs_lamb_phase1.sh @@ -138,7 +138,7 @@ do --loss_scale=dynamic \ --max_predictions_per_seq=20 \ --max_seq_length=128 \ - --use_npu_dropout=False \ + --use_npu_dropout=True \ --mixlist_file='ops_info_mul.json' \ --model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} \ --num_accumulation_steps=1 \ diff --git a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_base_1p_24bs_lamb_phase1_dropout.sh b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_base_1p_24bs_lamb_phase1_dropout.sh deleted file mode 100644 index c47d74740..000000000 --- a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_base_1p_24bs_lamb_phase1_dropout.sh +++ /dev/null @@ -1,220 +0,0 @@ -#!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` - -#集合通信参数,不需要修改 - -export RANK_SIZE=1 -export JOB_ID=10087 -export RANK_ID_START=0 - - -# 数据集路径,保持为空,不需要修改 -data_path="" - -#设置默认日志级别,不需要修改 - -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="BertLarge_ID0634_for_TensorFlow2.X" -#训练batch_size -eval_batch_size=4 -batch_size=24 -#训练step -train_steps=1000 -#训练epoch -train_epochs=`expr 768 / ${batch_size}` -#学习率 -learning_rate=0.000058711 - -#TF2.X独有,不需要修改 -#export NPU_LOOP_SIZE=${train_steps} -export NPU_LOOP_SIZE=100 -export NPU_ENABLE_PERF=true -export GE_USE_STATIC_MEMORY=1 - -#维测参数,precision_mode需要模型审视修改 -precision_mode="allow_mix_precision" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False - - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_full_1p.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - fi -done - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi - -train_files_path=${data_path}/'train_phase1/*' #need modify to actual path -eval_files_path=${data_path}/'eval_phase1/eval.tfrecord' #need modify to actual path - -#训练开始时间,不需要修改 -start_time=$(date +%s) - -#进入训练脚本目录,需要模型审视修改 - -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); -do - #设置环境变量,不需要修改 - echo "Device ID: $ASCEND_DEVICE_ID" - export RANK_ID=$RANK_ID - - - - #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} - else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} - fi - - #绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 - cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` - cpustep=`expr $cpucount / 8` - echo "taskset c steps:" $cpustep - let a=RANK_ID*$cpustep - let b=RANK_ID+1 - let c=b*$cpustep-1 - - #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 - #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune - nohup taskset -c $a-$c python3 ../bert/run_pretraining.py \ - --all_reduce_alg=nccl \ - --bert_config_file=../configs/bert_base_config.json \ - --beta_1=0.91063 \ - --beta_2=0.96497 \ - --device_warmup=False \ - --do_eval=True \ - --dtype=fp16 \ - --eval_batch_size=${eval_batch_size} \ - --train_files=${train_files_path} \ - --eval_files=${eval_files_path} \ - --learning_rate=${learning_rate} \ - --loss_scale=dynamic \ - --max_predictions_per_seq=20 \ - --max_seq_length=128 \ - --use_npu_dropout=True \ - --mixlist_file='ops_info_mul.json' \ - --model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} \ - --num_accumulation_steps=1 \ - --distribution_strategy=one_device \ - --num_gpus=1 \ - --num_steps_per_epoch=1000 \ - --num_train_epochs=${train_epochs} \ - --optimizer_type=lamb \ - --scale_loss=False \ - --steps_between_eval=2000 \ - --steps_per_loop=${NPU_LOOP_SIZE} \ - --stop_steps=200 \ - --enable_checkpoint_and_summary=True \ - --train_batch_size=${batch_size} \ - --verbosity=0 \ - --warmup_steps=0 \ - --precision_mode=${precision_mode} \ - --over_dump=${over_dump} \ - --over_dump_path=${over_dump_path} \ - --data_dump_flag=${data_dump_flag} \ - --data_dump_step=${data_dump_step} \ - --data_dump_path=${data_dump_path} \ - --profiling=${profiling} \ - --profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & -done -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -single_batch_step_sec=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $8}'` -FPS=`awk 'BEGIN{printf "%.2f\n",'${single_batch_step_sec}'*'${batch_size}'}'` -#打印,不需要修改 -echo "Final Performance images/sec : $FPS" - -#输出训练精度,需要模型审视修改 -train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'|sed 's/,//g'|cut -c 1-5` -#打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" -echo "E2E Training Duration sec : $e2e_time" - -#############冒烟看护######################### -BatchSize=${batch_size} -#设备类型 -DeviceType=`uname -m` -#用例名称 -CaseName=${Network}_base_phase1_dropout_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - -##获取性能数据 -#吞吐量,不需要修改 -ActualFPS=${FPS} -#单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` - -##获取Loss -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中 -grep loss $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print$11}'|grep -v instead|grep -v masked_lm_loss|sed 's/,//g'|sed '/^$/d' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log - -sed -i "/AttributeError/d" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log -sed -i "/ModuleNotFoundError/d" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log diff --git a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_base_8p_192bs_lamb_phase1.sh b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_base_8p_192bs_lamb_phase1.sh index 5f244af95..fe1a09167 100644 --- a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_base_8p_192bs_lamb_phase1.sh +++ b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_base_8p_192bs_lamb_phase1.sh @@ -140,7 +140,7 @@ do --max_predictions_per_seq=20 \ --max_seq_length=128 \ --mixlist_file='ops_info_mul.json' \ - --use_npu_dropout=False \ + --use_npu_dropout=True \ --model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} \ --num_accumulation_steps=1 \ --distribution_strategy=one_device \ -- Gitee