From 0f234d665b499c3b1bc6a2153c9ab0104962091a Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Fri, 30 Sep 2022 01:31:16 +0000 Subject: [PATCH 01/15] add train_performance_distribute.sh. Signed-off-by: jieliang cai <975092674@qq.com> --- .../test/train_performance_distribute.sh | 213 ++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh diff --git a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh new file mode 100644 index 000000000..81e4db3a1 --- /dev/null +++ b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh @@ -0,0 +1,213 @@ +#'!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export JOB_ID=10087 +RANK_ID_START=0 + +export NPU_ENABLE_PERF=true +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge_ID0634_for_TensorFlow2.X" +#训练batch_size +batch_size=192 +eval_batch_size=16 +#训练step +train_steps=1000 +#训练epoch +train_epochs=`expr 768 / ${batch_size}` +#学习率 +learning_rate=0.000144 + +#TF2.X独有,需要模型审视修改 +export NPU_LOOP_SIZE=100 +export GE_USE_STATIC_MEMORY=1 + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_8p_32bs.sh " + + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,需要模型审视修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +init_ckpt_path=${data_path}/'tf2_ckpt/model.ckpt-28252' #need modify to actual path +train_files_path=${data_path}/'train/*' #need modify to actual path +eval_files_path=${data_path}/'eval/eval.tfrecord' #need modify to actual path + + + +start_time=$(date +%s) +#############执行训练######################### + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" + +if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} +else + mkdir -p 
${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} +fi + +#绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 +cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` +cpustep=`expr $cpucount / 8` +echo "taskset c steps:" $cpustep +let a=RANK_ID*$cpustep +let b=RANK_ID+1 +let c=b*$cpustep-1 + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune +if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" +fi +nohup ${bind_core} python3 ../bert/run_pretraining.py \ +--all_reduce_alg=nccl \ + --bert_config_file=../configs/bert_config.json \ +--beta_1=0.91063 \ +--beta_2=0.96497 \ +--device_warmup=False \ +--do_eval=True \ +--dtype=fp16 \ +--eval_batch_size=${eval_batch_size} \ +--init_checkpoint=${init_ckpt_path} \ + --train_files=${train_files_path} \ +--eval_files=${eval_files_path} \ +--learning_rate=${learning_rate} \ +--loss_scale=dynamic \ +--max_predictions_per_seq=76 \ +--max_seq_length=512 \ +--model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} \ +--num_accumulation_steps=1 \ +--distribution_strategy=one_device \ +--num_gpus=1 \ +--enable_checkpoint_and_summary=True \ + --num_steps_per_epoch=1000 \ +--num_train_epochs=${train_epochs} \ +--optimizer_type=lamb \ +--scale_loss=False \ +--steps_between_eval=100 \ +--steps_per_loop=${NPU_LOOP_SIZE} \ +--stop_steps=200 \ +--train_batch_size=${batch_size} \ +--verbosity=0 \ +--warmup_steps=0 \ +--precision_mode=${precision_mode} \ +--attention_with_dropout_v3=False \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} \ + --data_dump_path=${data_dump_path} \ +--profiling=${profiling} \ +--profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#############结果处理######################### +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +single_batch_step_sec=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $8}'` +FPS=`awk 'BEGIN{printf "%.2f\n",'${single_batch_step_sec}'*'${batch_size}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'|sed 's/,//g'|cut -c 1-5` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#############冒烟看护######################### +BatchSize=${batch_size} +#设备类型 +DeviceType=`uname -m` +#用例名称 +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` + +##获取Loss +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中 +grep loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print$11}'|grep -v instead|grep -v masked_lm_loss|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log 
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + +sed -i "/AttributeError/d" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log \ No newline at end of file -- Gitee From 86049aa59c175af24206b7cd87031240852ee524 Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Fri, 30 Sep 2022 01:33:27 +0000 Subject: [PATCH 02/15] add train_performance_distribute.sh. Signed-off-by: jieliang cai <975092674@qq.com> --- .../test/train_performance_distribute.sh | 185 ++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_performance_distribute.sh diff --git a/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_performance_distribute.sh b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_performance_distribute.sh new file mode 100644 index 000000000..40f36e10c --- /dev/null +++ b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_performance_distribute.sh @@ -0,0 +1,185 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export JOB_ID=10087 +RANK_ID_START=0 +export PYTHONPATH=../transformer:$PYTHONPATH + +export NPU_ENABLE_PERF=true +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="Transformer_ID0633_for_TensorFlow2.X" +#训练batch_size +batch_size=32768 +#训练step +train_steps=500 + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_8p_49152bs.sh " + + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,需要模型审视修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* 
]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +start_time=$(date +%s) +#############执行训练######################### + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" + +if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} +fi + +#绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 +cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` +cpustep=`expr $cpucount / 8` +echo "taskset c steps:" $cpustep +let a=RANK_ID*$cpustep +let b=RANK_ID+1 +let c=b*$cpustep-1 + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune +if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" +fi +nohup ${bind_core} python3 ../transformer/official/nlp/transformer/transformer_main.py \ +--data_dir=${data_path} \ +--model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \ +--vocab_file=${data_path}/vocab.ende.32768 \ +--param_set=big \ +--train_steps=${train_steps} \ +--static_batch=true \ +--batch_size=${batch_size} \ +--steps_between_evals=100 \ +--max_length=64 \ +--mode=train \ +--decode_batch_size=32 \ +--decode_max_length=97 \ +--padded_decode=False \ +--num_gpus=1 \ +--dtype=fp16 \ +--distribution_strategy='one_device' \ +--enable_time_history=true \ +--log_steps=100 \ +--loss_scale='dynamic' \ +--precision_mode=${precision_mode} \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} \ + --data_dump_path=${data_dump_path} \ +--profiling=${profiling} \ +--profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#############结果处理######################### +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +single_batch_step_sec=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $8}'|tail -n +2|awk '{sum+=$1} END {print sum/NR}'` +FPS=`awk 'BEGIN{printf "%.2f\n",'${single_batch_step_sec}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'|sed 's/,//g'|cut -c 1-5` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#############冒烟看护######################### +BatchSize=${batch_size} +#设备类型 +DeviceType=`uname -m` +#用例名称 +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` + +##获取Loss +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中 +grep 'Train history' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}'| sed 's/\[//g'|sed 's/\]}//g' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值 +ActualLoss=`awk 'END 
{print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 0f934393cbad1fd1cd01ae2384b6d53ee91128ab Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Fri, 30 Sep 2022 01:37:43 +0000 Subject: [PATCH 03/15] add train_performance_distribute.sh. Signed-off-by: jieliang cai <975092674@qq.com> --- .../test/train_performance_distribute.sh | 196 ++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_distribute.sh diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_distribute.sh b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_distribute.sh new file mode 100644 index 000000000..0b6bdf0a4 --- /dev/null +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_distribute.sh @@ -0,0 +1,196 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' + +#集合通信参数,不需要修改 +export JOB_ID=10087 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +anno_converted='/npu/traindata/COCO2017/val2017.txt' +gt_anno_path='/npu/traindata/COCO2017/annotations/instances_val2017.json' + +#屏蔽TF2.4升级到TF2.6图差异带来的性能下降 +export NPU_EXECUTE_OP_BY_ACL=false + +#设置默认日志级别,不需要修改 +export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="YOLOv5_ID1719_for_TensorFlow2.X" + +# 训练epoch +stage1_epoch=0 +stage2_epoch=1 + +# 训练batchsize +batch_size=8 + +train_worker_num=8 + +# TF2.X独有,不需要修改 +export NPU_LOOPSIZE=1 + +# 精度模式 +precision_mode='allow_mix_precision' +#维持参数,不需要修改 +over_dump=False +over_dump_path='' +data_dump_flag=False +data_dump_path='' +data_dump_step="1" +profiling=False +autotune=False +perf=20 + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_8p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + 
over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be specified" + exit 1 +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) +bind_core=1 +#进入训练脚本目录,需要模型审视修改 + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" + +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/$ASCEND_DEVICE_ID ];then + rm -rf ${cur_path}/output/$ASCEND_DEVICE_ID + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +fi +cd ${cur_path}/output/$ASCEND_DEVICE_ID/ +#执行训练脚本,需要模型审视修改 +corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` +let a=RANK_ID*${corenum}/8 +let b=RANK_ID+1 +let c=b*${corenum}/8-1 +if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" +fi +#${bind_core} python3 ../../../train.py --weights='' \ +nohup ${bind_core} python3 ../../../train.py --weights='' \ + --perf=$perf \ + --model=yolov5m \ + --rank=${RANK_ID} \ + --rank_size=${RANK_SIZE} \ + --train_worker_num=${train_worker_num} \ + --data_path=${data_path} \ + --anno_converted=${anno_converted} \ + --gt_anno_path=${gt_anno_path} \ + --batch_size=${batch_size} \ + --precision_mode=${precision_mode} \ + --stage1_epoch=${stage1_epoch} \ + --stage2_epoch=${stage2_epoch} \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +echo "------------------ Final result ------------------" +#输出性能FPS。需要模型审视修改 +epoch_duration=`grep epoch_duration $cur_path/output/0/train_0.log | awk '{print $2}'` +first_step=`grep duration: $cur_path/output/0/train_0.log |head -1| awk -F "duration:" '{print $2}' |sed s/[[:space:]]//g` +FPS=`awk 'BEGIN{printf "%.2f\n",('$perf'+'$train_worker_num'-2)/('$epoch_duration'-'$first_step')*'$batch_size'*8}'` +echo "Final Performance imgs/sec : $FPS" + +#训练精度,需要从train_$ASCEND_DEVICE_ID.log里,通过关键字获取。需要模型审视修改 +# li=`cat $cur_path/output/0/train_0.log | wc -l` +# num=$(($li - 1)) +# train_accuracy=`sed -n "${num}p" $cur_path/output/0/train_0.log | awk '{print $3}'` +# echo "Final Train Accuracy : ${train_accuracy}" +#E2E训练端到端时长,直接计算,不需要修改 +echo "E2E training Duration sec: $e2e_time" + +#训练用例信息,不需要修改 +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${batch_size}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",('$epoch_duration'-'$first_step')/('$perf'+'$train_worker_num'-2)}'` + +##获取Loss,通过train_*.log中关键字,需要根据模型审视 +grep loss $cur_path/output/0/train_0.log|awk '{print $13}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`grep total_loss: $cur_path/output/0/train_0.log | awk 'END{print $13}'` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo 
"Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +# echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + sed -i "/AttributeError/d" $cur_path/output/${RANK_ID}/train_${RANK_ID}.log +done \ No newline at end of file -- Gitee From 403baeba4612ee953161145c2d6cb0a0fdd75210 Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Fri, 30 Sep 2022 01:47:57 +0000 Subject: [PATCH 04/15] add train_ID0060_BertBase_performance_distribute.sh. Signed-off-by: jieliang cai <975092674@qq.com> --- ..._ID0060_BertBase_performance_distribute.sh | 175 ++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh new file mode 100644 index 000000000..a8906a5df --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh @@ -0,0 +1,175 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export JOB_ID=99990001 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="Bert-base_ID0060_for_TensorFlow" +#训练epoch +train_epochs=1 +#训练batch_size +batch_size=128 +#训练step +train_steps=1000 +#学习率 +learning_rate= + +#维测参数,precision_mode需要模型审视修改 +#precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False +autotune=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para 
== --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" + +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt${ASCEND_DEVICE_ID} +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt${ASCEND_DEVICE_ID} +fi + + # 绑核,不需要的绑核的模型删除,需要模型审视修改 +corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` +let a=RANK_ID*${corenum}/${RANK_SIZE} +let b=RANK_ID+1 +let c=b*${corenum}/${RANK_SIZE}-1 + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path +if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" +fi +nohup ${bind_core} python3.7 $cur_path/../src/run_pretraining.py --bert_config_file=${cur_path}/../configs/bert_base_config.json \ +--max_seq_length=128 \ +--max_predictions_per_seq=20 \ +--train_batch_size=${batch_size} \ +--learning_rate=1e-4 \ +--num_warmup_steps=0 \ +--num_train_steps=${train_steps} \ +--optimizer_type=adam \ +--manual_fp16=True \ +--use_fp16_cls=True \ +--input_files_dir=${data_path}/tfrecord/seq_len_128_max_pred_20/wikicorpus_en/training \ +--eval_files_dir=${data_path}/tfrecord/seq_len_128_max_pred_20/wikicorpus_en/test \ +--npu_bert_debug=False \ +--npu_bert_use_tdt=True \ +--do_train=True \ +--num_accumulation_steps=1 \ +--npu_bert_job_start_file= \ +--iterations_per_loop=100 \ +--save_checkpoints_steps=1000 \ +--npu_bert_clip_by_global_norm=False \ +--distributed=True \ +--npu_bert_tail_optimize=True \ +--npu_bert_loss_scale=0 \ +--output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +ActualFPS=`grep Throughput ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk 'END {print $6}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}' * '${RANK_SIZE}' / '${ActualFPS}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $ActualFPS" + +#输出训练精度,需要模型审视修改 +#train_accuracy=`grep -A 1 top1 $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $3}'` +#打印,不需要修改 +#echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "tensorflow:loss =" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss = " '{print $2}' | awk -F "," '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + 
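# Launch note: this distribute variant reads RANK_ID, RANK_SIZE and ASCEND_DEVICE_ID from the
# environment but never sets them itself (RANK_ID_START is defined and then unused), so it
# depends on an external launcher. A minimal launcher sketch, run from the test directory,
# assuming an 8-device rank table at ../configs/rank_table_8p.json and one script instance
# per device; the rank-table path and data_path below are illustrative, not part of this patch.
export RANK_TABLE_FILE=$(dirname $(pwd))/configs/rank_table_8p.json  # assumed rank table location
export RANK_SIZE=8
for dev in $(seq 0 7); do
    # each instance writes its logs under output/${dev}, so the parallel runs do not collide
    RANK_ID=${dev} ASCEND_DEVICE_ID=${dev} \
        bash train_ID0060_BertBase_performance_distribute.sh --data_path=/path/to/wikicorpus_en &
done
wait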
+#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From b39de3acd43ecfb88bf5cea39c0dea018bf4199e Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Fri, 30 Sep 2022 01:51:15 +0000 Subject: [PATCH 05/15] add train_ID0495_Bert-Squad_performance_distribute.sh. Signed-off-by: jieliang cai <975092674@qq.com> --- ...D0495_Bert-Squad_performance_distribute.sh | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh diff --git a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh new file mode 100644 index 000000000..8ef5eae04 --- /dev/null +++ b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh @@ -0,0 +1,144 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` +parent_path=$(dirname $(pwd)) + +#集合通信参数,不需要修改 +#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 +export JOB_ID=10087 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="Bertsquad_ID0495_for_TensorFlow" +batch_size=32 +epoch=1 + +#维持参数,不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_8p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + 
fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +vocab_file=${data_path}/model/vocab.txt +bert_config_file=${data_path}/model/bert_config.json +init_checkpoint=${data_path}/model/bert_model.ckpt +train_file=${data_path}/dataset/train-v1.1_small.json +predict_file=${data_path}/dataset/dev-v1.1.json + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" + +#创建DeviceID输出目录,不需要修改 +if [ -d $cur_path/output/$ASCEND_DEVICE_ID ];then + rm -rf $cur_path/output/$ASCEND_DEVICE_ID + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +fi + +#执行训练脚本,需要模型审视修改 +nohup python3.7 ${parent_path}/run_squad.py \ + --vocab_file=$vocab_file \ + --bert_config_file=$bert_config_file \ + --init_checkpoint=$init_checkpoint \ + --train_file=$train_file \ + --do_predict=True \ + --do_train=True \ + --predict_file=$predict_file \ + --train_batch_size=${batch_size} \ + --num_train_epochs=${epoch} \ + --num_train_steps=1000 \ + --learning_rate=3e-5 \ + --max_seq_length=384 \ + --doc_stride=128 \ + --output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#E2E训练端到端时长,直接计算,不需要修改 +echo "E2E training Duration sec: $e2e_time" + +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +#获取性能数据 +step_per_sec=`grep "global_step/sec:" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 'global_step/sec:' '{print $2}'|awk 'END {print $1}'` +ActualFPS=`awk 'BEGIN {printf "%.2f\n", '${step_per_sec}' * '${batch_size}' * '${RANK_SIZE}'}'` +TrainingTime=`awk 'BEGIN {printf "%.2f\n", '8000' * '${batch_size}' / '${ActualFPS}'}'` + +ActualLoss=`grep "loss =" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 'loss =' '{print $2}'|awk 'END {print $1}'|tr -d ,` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +sed -i -e '/ModuleNotFoundError/d' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log \ No newline at end of file -- Gitee From a8f5feb8f3ba63cd9dab7d28362ac24b1a540c50 Mon Sep 17 00:00:00 2001 From: caijieliang <975092674@qq.com> Date: Fri, 30 Sep 2022 14:08:11 +0800 Subject: [PATCH 06/15] modify --- ...D0495_Bert-Squad_performance_distribute.sh | 288 ++++++------ ..._ID0060_BertBase_performance_distribute.sh | 350 +++++++------- .../test/train_performance_distribute.sh | 426 +++++++++--------- .../test/train_performance_distribute.sh | 370 +++++++-------- 4 files changed, 717 
insertions(+), 717 deletions(-) diff --git a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh index 8ef5eae04..b5c811e34 100644 --- a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh +++ b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh @@ -1,144 +1,144 @@ -#!/bin/bash -#当前路径,不需要修改 -cur_path=`pwd` -parent_path=$(dirname $(pwd)) - -#集合通信参数,不需要修改 -#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 -export JOB_ID=10087 -RANK_ID_START=0 - -# 数据集路径,保持为空,不需要修改 -data_path="" - -#基础参数 需要模型审视修改 -#网络名称,同目录名称 -Network="Bertsquad_ID0495_for_TensorFlow" -batch_size=32 -epoch=1 - -#维持参数,不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_full_8p.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is 0 - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --autotune whether to enable autotune, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - fi -done - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi - - -vocab_file=${data_path}/model/vocab.txt -bert_config_file=${data_path}/model/bert_config.json -init_checkpoint=${data_path}/model/bert_model.ckpt -train_file=${data_path}/dataset/train-v1.1_small.json -predict_file=${data_path}/dataset/dev-v1.1.json - -#训练开始时间,不需要修改 -start_time=$(date +%s) - -#进入训练脚本目录,需要模型审视修改 - -#设置环境变量,不需要修改 -echo "Device ID: $RANK_ID" - -#创建DeviceID输出目录,不需要修改 -if [ -d $cur_path/output/$ASCEND_DEVICE_ID ];then - rm -rf $cur_path/output/$ASCEND_DEVICE_ID - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt -else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt -fi - -#执行训练脚本,需要模型审视修改 -nohup python3.7 ${parent_path}/run_squad.py \ - --vocab_file=$vocab_file \ - --bert_config_file=$bert_config_file \ - --init_checkpoint=$init_checkpoint \ - --train_file=$train_file \ - --do_predict=True \ - --do_train=True \ - --predict_file=$predict_file \ - --train_batch_size=${batch_size} \ - --num_train_epochs=${epoch} \ - --num_train_steps=1000 \ - --learning_rate=3e-5 \ - --max_seq_length=384 \ - --doc_stride=128 \ - --output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt > 
${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -#E2E训练端到端时长,直接计算,不需要修改 -echo "E2E training Duration sec: $e2e_time" - -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - -#获取性能数据 -step_per_sec=`grep "global_step/sec:" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 'global_step/sec:' '{print $2}'|awk 'END {print $1}'` -ActualFPS=`awk 'BEGIN {printf "%.2f\n", '${step_per_sec}' * '${batch_size}' * '${RANK_SIZE}'}'` -TrainingTime=`awk 'BEGIN {printf "%.2f\n", '8000' * '${batch_size}' / '${ActualFPS}'}'` - -ActualLoss=`grep "loss =" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 'loss =' '{print $2}'|awk 'END {print $1}'|tr -d ,` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -sed -i -e '/ModuleNotFoundError/d' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log \ No newline at end of file +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` +parent_path=$(dirname $(pwd)) + +#集合通信参数,不需要修改 +#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 +export JOB_ID=10087 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="Bertsquad_ID0495_for_TensorFlow" +batch_size=32 +epoch=1 + +#维持参数,不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_8p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + 
data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +vocab_file=${data_path}/model/vocab.txt +bert_config_file=${data_path}/model/bert_config.json +init_checkpoint=${data_path}/model/bert_model.ckpt +train_file=${data_path}/dataset/train-v1.1_small.json +predict_file=${data_path}/dataset/dev-v1.1.json + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" + +#创建DeviceID输出目录,不需要修改 +if [ -d $cur_path/output/$ASCEND_DEVICE_ID ];then + rm -rf $cur_path/output/$ASCEND_DEVICE_ID + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +fi + +#执行训练脚本,需要模型审视修改 +nohup python3.7 ${parent_path}/run_squad.py \ + --vocab_file=$vocab_file \ + --bert_config_file=$bert_config_file \ + --init_checkpoint=$init_checkpoint \ + --train_file=$train_file \ + --do_predict=True \ + --do_train=True \ + --predict_file=$predict_file \ + --train_batch_size=${batch_size} \ + --num_train_epochs=${epoch} \ + --num_train_steps=1000 \ + --learning_rate=3e-5 \ + --max_seq_length=384 \ + --doc_stride=128 \ + --output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#E2E训练端到端时长,直接计算,不需要修改 +echo "E2E training Duration sec: $e2e_time" + +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +#获取性能数据 +step_per_sec=`grep "global_step/sec:" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 'global_step/sec:' '{print $2}'|awk 'END {print $1}'` +ActualFPS=`awk 'BEGIN {printf "%.2f\n", '${step_per_sec}' * '${batch_size}' * '${RANK_SIZE}'}'` +TrainingTime=`awk 'BEGIN {printf "%.2f\n", '8000' * '${batch_size}' / '${ActualFPS}'}'` + +ActualLoss=`grep "loss =" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 'loss =' '{print $2}'|awk 'END {print $1}'|tr -d ,` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +sed -i -e '/ModuleNotFoundError/d' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh index a8906a5df..63e153395 100644 --- a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh +++ 
b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh @@ -1,175 +1,175 @@ -#!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` - -#集合通信参数,不需要修改 -export JOB_ID=99990001 -RANK_ID_START=0 - -# 数据集路径,保持为空,不需要修改 -data_path="" - -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="Bert-base_ID0060_for_TensorFlow" -#训练epoch -train_epochs=1 -#训练batch_size -batch_size=128 -#训练step -train_steps=1000 -#学习率 -learning_rate= - -#维测参数,precision_mode需要模型审视修改 -#precision_mode="allow_mix_precision" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False -autotune=False - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_full_1p.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --autotune whether to enable autotune, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - fi -done - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi - -#训练开始时间,不需要修改 -start_time=$(date +%s) - -#进入训练脚本目录,需要模型审视修改 - -#设置环境变量,不需要修改 -echo "Device ID: $RANK_ID" - -#创建DeviceID输出目录,不需要修改 -if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt${ASCEND_DEVICE_ID} -else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt${ASCEND_DEVICE_ID} -fi - - # 绑核,不需要的绑核的模型删除,需要模型审视修改 -corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` -let a=RANK_ID*${corenum}/${RANK_SIZE} -let b=RANK_ID+1 -let c=b*${corenum}/${RANK_SIZE}-1 - -#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path -if [ "x${bind_core}" != x ];then - bind_core="taskset -c $a-$c" -fi -nohup ${bind_core} python3.7 $cur_path/../src/run_pretraining.py --bert_config_file=${cur_path}/../configs/bert_base_config.json \ ---max_seq_length=128 \ ---max_predictions_per_seq=20 \ ---train_batch_size=${batch_size} \ ---learning_rate=1e-4 \ ---num_warmup_steps=0 \ ---num_train_steps=${train_steps} \ ---optimizer_type=adam \ ---manual_fp16=True \ ---use_fp16_cls=True \ ---input_files_dir=${data_path}/tfrecord/seq_len_128_max_pred_20/wikicorpus_en/training \ ---eval_files_dir=${data_path}/tfrecord/seq_len_128_max_pred_20/wikicorpus_en/test \ ---npu_bert_debug=False \ ---npu_bert_use_tdt=True \ ---do_train=True \ 
---num_accumulation_steps=1 \ ---npu_bert_job_start_file= \ ---iterations_per_loop=100 \ ---save_checkpoints_steps=1000 \ ---npu_bert_clip_by_global_norm=False \ ---distributed=True \ ---npu_bert_tail_optimize=True \ ---npu_bert_loss_scale=0 \ ---output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -ActualFPS=`grep Throughput ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk 'END {print $6}'` -TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}' * '${RANK_SIZE}' / '${ActualFPS}'}'` -#打印,不需要修改 -echo "Final Performance images/sec : $ActualFPS" - -#输出训练精度,需要模型审视修改 -#train_accuracy=`grep -A 1 top1 $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $3}'` -#打印,不需要修改 -#echo "Final Train Accuracy : ${train_accuracy}" -echo "E2E Training Duration sec : $e2e_time" - -#稳定性精度看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - - -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep "tensorflow:loss =" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss = " '{print $2}' | awk -F "," '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export JOB_ID=99990001 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="Bert-base_ID0060_for_TensorFlow" +#训练epoch +train_epochs=1 +#训练batch_size +batch_size=128 +#训练step +train_steps=1000 +#学习率 +learning_rate= + +#维测参数,precision_mode需要模型审视修改 +#precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False +autotune=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of 
training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" + +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt${ASCEND_DEVICE_ID} +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt${ASCEND_DEVICE_ID} +fi + + # 绑核,不需要的绑核的模型删除,需要模型审视修改 +corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` +let a=$ASCEND_DEVICE_ID*${corenum}/${RANK_SIZE} +let b=$ASCEND_DEVICE_ID+1 +let c=b*${corenum}/${RANK_SIZE}-1 + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path +if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" +fi +nohup ${bind_core} python3.7 $cur_path/../src/run_pretraining.py --bert_config_file=${cur_path}/../configs/bert_base_config.json \ +--max_seq_length=128 \ +--max_predictions_per_seq=20 \ +--train_batch_size=${batch_size} \ +--learning_rate=1e-4 \ +--num_warmup_steps=0 \ +--num_train_steps=${train_steps} \ +--optimizer_type=adam \ +--manual_fp16=True \ +--use_fp16_cls=True \ +--input_files_dir=${data_path}/tfrecord/seq_len_128_max_pred_20/wikicorpus_en/training \ +--eval_files_dir=${data_path}/tfrecord/seq_len_128_max_pred_20/wikicorpus_en/test \ +--npu_bert_debug=False \ +--npu_bert_use_tdt=True \ +--do_train=True \ +--num_accumulation_steps=1 \ +--npu_bert_job_start_file= \ +--iterations_per_loop=100 \ +--save_checkpoints_steps=1000 \ +--npu_bert_clip_by_global_norm=False \ +--distributed=True \ +--npu_bert_tail_optimize=True \ +--npu_bert_loss_scale=0 \ +--output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +ActualFPS=`grep Throughput ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk 'END {print $6}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}' * '${RANK_SIZE}' / '${ActualFPS}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $ActualFPS" + +#输出训练精度,需要模型审视修改 +#train_accuracy=`grep -A 1 top1 $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $3}'` +#打印,不需要修改 +#echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} 
+DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "tensorflow:loss =" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss = " '{print $2}' | awk -F "," '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh index 81e4db3a1..ace12437e 100644 --- a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh +++ b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh @@ -1,213 +1,213 @@ -#'!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` - -#集合通信参数,不需要修改 -export JOB_ID=10087 -RANK_ID_START=0 - -export NPU_ENABLE_PERF=true -# 数据集路径,保持为空,不需要修改 -data_path="" - -#基础参数 需要模型审视修改 -#网络名称,同目录名称 -Network="BertLarge_ID0634_for_TensorFlow2.X" -#训练batch_size -batch_size=192 -eval_batch_size=16 -#训练step -train_steps=1000 -#训练epoch -train_epochs=`expr 768 / ${batch_size}` -#学习率 -learning_rate=0.000144 - -#TF2.X独有,需要模型审视修改 -export NPU_LOOP_SIZE=100 -export GE_USE_STATIC_MEMORY=1 - -#维测参数,precision_mode需要模型审视修改 -precision_mode="allow_mix_precision" -#维持参数,不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False - -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_full_8p_32bs.sh " - - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is 0 - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,需要模型审视修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - 
profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - elif [[ $para == --bind_core* ]]; then - bind_core=`echo ${para#*=}` - name_bind="_bindcore" - fi -done - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi - -init_ckpt_path=${data_path}/'tf2_ckpt/model.ckpt-28252' #need modify to actual path -train_files_path=${data_path}/'train/*' #need modify to actual path -eval_files_path=${data_path}/'eval/eval.tfrecord' #need modify to actual path - - - -start_time=$(date +%s) -#############执行训练######################### - -#设置环境变量,不需要修改 -echo "Device ID: $RANK_ID" - -if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} -else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} -fi - -#绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 -cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` -cpustep=`expr $cpucount / 8` -echo "taskset c steps:" $cpustep -let a=RANK_ID*$cpustep -let b=RANK_ID+1 -let c=b*$cpustep-1 - -#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune -if [ "x${bind_core}" != x ];then - bind_core="taskset -c $a-$c" -fi -nohup ${bind_core} python3 ../bert/run_pretraining.py \ ---all_reduce_alg=nccl \ - --bert_config_file=../configs/bert_config.json \ ---beta_1=0.91063 \ ---beta_2=0.96497 \ ---device_warmup=False \ ---do_eval=True \ ---dtype=fp16 \ ---eval_batch_size=${eval_batch_size} \ ---init_checkpoint=${init_ckpt_path} \ - --train_files=${train_files_path} \ ---eval_files=${eval_files_path} \ ---learning_rate=${learning_rate} \ ---loss_scale=dynamic \ ---max_predictions_per_seq=76 \ ---max_seq_length=512 \ ---model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} \ ---num_accumulation_steps=1 \ ---distribution_strategy=one_device \ ---num_gpus=1 \ ---enable_checkpoint_and_summary=True \ - --num_steps_per_epoch=1000 \ ---num_train_epochs=${train_epochs} \ ---optimizer_type=lamb \ ---scale_loss=False \ ---steps_between_eval=100 \ ---steps_per_loop=${NPU_LOOP_SIZE} \ ---stop_steps=200 \ ---train_batch_size=${batch_size} \ ---verbosity=0 \ ---warmup_steps=0 \ ---precision_mode=${precision_mode} \ ---attention_with_dropout_v3=False \ - --over_dump=${over_dump} \ - --over_dump_path=${over_dump_path} \ - --data_dump_flag=${data_dump_flag} \ - --data_dump_step=${data_dump_step} \ - --data_dump_path=${data_dump_path} \ ---profiling=${profiling} \ ---profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -#############结果处理######################### -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -single_batch_step_sec=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $8}'` -FPS=`awk 'BEGIN{printf "%.2f\n",'${single_batch_step_sec}'*'${batch_size}'}'` -#打印,不需要修改 -echo "Final Performance images/sec : $FPS" - -#输出训练精度,需要模型审视修改 -train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'|sed 's/,//g'|cut -c 1-5` -#打印,不需要修改 -echo "Final 
Train Accuracy : ${train_accuracy}" -echo "E2E Training Duration sec : $e2e_time" - -#############冒烟看护######################### -BatchSize=${batch_size} -#设备类型 -DeviceType=`uname -m` -#用例名称 -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - -##获取性能数据 -#吞吐量,不需要修改 -ActualFPS=${FPS} -#单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` - -##获取Loss -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中 -grep loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print$11}'|grep -v instead|grep -v masked_lm_loss|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log - -sed -i "/AttributeError/d" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log \ No newline at end of file +#'!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export JOB_ID=10087 +RANK_ID_START=0 + +export NPU_ENABLE_PERF=true +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge_ID0634_for_TensorFlow2.X" +#训练batch_size +batch_size=192 +eval_batch_size=16 +#训练step +train_steps=1000 +#训练epoch +train_epochs=`expr 768 / ${batch_size}` +#学习率 +learning_rate=0.000144 + +#TF2.X独有,需要模型审视修改 +export NPU_LOOP_SIZE=100 +export GE_USE_STATIC_MEMORY=1 + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_8p_32bs.sh " + + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,需要模型审视修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == 
--profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +init_ckpt_path=${data_path}/'tf2_ckpt/model.ckpt-28252' #need modify to actual path +train_files_path=${data_path}/'train/*' #need modify to actual path +eval_files_path=${data_path}/'eval/eval.tfrecord' #need modify to actual path + + + +start_time=$(date +%s) +#############执行训练######################### + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" + +if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} +fi + +#绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 +cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` +cpustep=`expr $cpucount / 8` +echo "taskset c steps:" $cpustep +let a=$ASCEND_DEVICE_ID*$cpustep +let b=$ASCEND_DEVICE_ID+1 +let c=b*$cpustep-1 + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune +if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" +fi +nohup ${bind_core} python3 ../bert/run_pretraining.py \ +--all_reduce_alg=nccl \ + --bert_config_file=../configs/bert_config.json \ +--beta_1=0.91063 \ +--beta_2=0.96497 \ +--device_warmup=False \ +--do_eval=True \ +--dtype=fp16 \ +--eval_batch_size=${eval_batch_size} \ +--init_checkpoint=${init_ckpt_path} \ + --train_files=${train_files_path} \ +--eval_files=${eval_files_path} \ +--learning_rate=${learning_rate} \ +--loss_scale=dynamic \ +--max_predictions_per_seq=76 \ +--max_seq_length=512 \ +--model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} \ +--num_accumulation_steps=1 \ +--distribution_strategy=one_device \ +--num_gpus=1 \ +--enable_checkpoint_and_summary=True \ + --num_steps_per_epoch=1000 \ +--num_train_epochs=${train_epochs} \ +--optimizer_type=lamb \ +--scale_loss=False \ +--steps_between_eval=100 \ +--steps_per_loop=${NPU_LOOP_SIZE} \ +--stop_steps=200 \ +--train_batch_size=${batch_size} \ +--verbosity=0 \ +--warmup_steps=0 \ +--precision_mode=${precision_mode} \ +--attention_with_dropout_v3=False \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} \ + --data_dump_path=${data_dump_path} \ +--profiling=${profiling} \ +--profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#############结果处理######################### +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +single_batch_step_sec=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $8}'` +FPS=`awk 'BEGIN{printf "%.2f\n",'${single_batch_step_sec}'*'${batch_size}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v 
mlp_log|awk 'END {print $5}'|sed 's/,//g'|cut -c 1-5` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#############冒烟看护######################### +BatchSize=${batch_size} +#设备类型 +DeviceType=`uname -m` +#用例名称 +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` + +##获取Loss +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中 +grep loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print$11}'|grep -v instead|grep -v masked_lm_loss|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + +sed -i "/AttributeError/d" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log diff --git a/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_performance_distribute.sh b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_performance_distribute.sh index 40f36e10c..d759a2205 100644 --- a/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_performance_distribute.sh +++ b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_performance_distribute.sh @@ -1,185 +1,185 @@ -#!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` - -#集合通信参数,不需要修改 -export JOB_ID=10087 -RANK_ID_START=0 -export PYTHONPATH=../transformer:$PYTHONPATH - -export NPU_ENABLE_PERF=true -# 数据集路径,保持为空,不需要修改 -data_path="" - -#基础参数 需要模型审视修改 -#网络名称,同目录名称 -Network="Transformer_ID0633_for_TensorFlow2.X" -#训练batch_size -batch_size=32768 -#训练step -train_steps=500 - -#维测参数,precision_mode需要模型审视修改 -precision_mode="allow_mix_precision" -#维持参数,不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False - -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_full_8p_49152bs.sh " - - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is 0 - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,需要模型审视修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - 
elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - elif [[ $para == --bind_core* ]]; then - bind_core=`echo ${para#*=}` - name_bind="_bindcore" - fi -done - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi - -start_time=$(date +%s) -#############执行训练######################### - -#设置环境变量,不需要修改 -echo "Device ID: $RANK_ID" - -if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} -else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} -fi - -#绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 -cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` -cpustep=`expr $cpucount / 8` -echo "taskset c steps:" $cpustep -let a=RANK_ID*$cpustep -let b=RANK_ID+1 -let c=b*$cpustep-1 - -#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune -if [ "x${bind_core}" != x ];then - bind_core="taskset -c $a-$c" -fi -nohup ${bind_core} python3 ../transformer/official/nlp/transformer/transformer_main.py \ ---data_dir=${data_path} \ ---model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \ ---vocab_file=${data_path}/vocab.ende.32768 \ ---param_set=big \ ---train_steps=${train_steps} \ ---static_batch=true \ ---batch_size=${batch_size} \ ---steps_between_evals=100 \ ---max_length=64 \ ---mode=train \ ---decode_batch_size=32 \ ---decode_max_length=97 \ ---padded_decode=False \ ---num_gpus=1 \ ---dtype=fp16 \ ---distribution_strategy='one_device' \ ---enable_time_history=true \ ---log_steps=100 \ ---loss_scale='dynamic' \ ---precision_mode=${precision_mode} \ - --over_dump=${over_dump} \ - --over_dump_path=${over_dump_path} \ - --data_dump_flag=${data_dump_flag} \ - --data_dump_step=${data_dump_step} \ - --data_dump_path=${data_dump_path} \ ---profiling=${profiling} \ ---profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -#############结果处理######################### -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -single_batch_step_sec=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $8}'|tail -n +2|awk '{sum+=$1} END {print sum/NR}'` -FPS=`awk 'BEGIN{printf "%.2f\n",'${single_batch_step_sec}'}'` -#打印,不需要修改 -echo "Final Performance images/sec : $FPS" - -#输出训练精度,需要模型审视修改 -train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'|sed 's/,//g'|cut -c 1-5` -#打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" -echo "E2E Training Duration sec : $e2e_time" - -#############冒烟看护######################### -BatchSize=${batch_size} 
-#设备类型 -DeviceType=`uname -m` -#用例名称 -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - -##获取性能数据 -#吞吐量,不需要修改 -ActualFPS=${FPS} -#单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` - -##获取Loss -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中 -grep 'Train history' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}'| sed 's/\[//g'|sed 's/\]}//g' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export JOB_ID=10087 +RANK_ID_START=0 +export PYTHONPATH=../transformer:$PYTHONPATH + +export NPU_ENABLE_PERF=true +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="Transformer_ID0633_for_TensorFlow2.X" +#训练batch_size +batch_size=32768 +#训练step +train_steps=500 + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_8p_49152bs.sh " + + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,需要模型审视修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" 
must be confing" + exit 1 +fi + +start_time=$(date +%s) +#############执行训练######################### + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" + +if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} +fi + +#绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 +cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` +cpustep=`expr $cpucount / 8` +echo "taskset c steps:" $cpustep +let a=$ASCEND_DEVICE_ID*$cpustep +let b=$ASCEND_DEVICE_ID+1 +let c=b*$cpustep-1 + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune +if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" +fi +nohup ${bind_core} python3 ../transformer/official/nlp/transformer/transformer_main.py \ +--data_dir=${data_path} \ +--model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \ +--vocab_file=${data_path}/vocab.ende.32768 \ +--param_set=big \ +--train_steps=${train_steps} \ +--static_batch=true \ +--batch_size=${batch_size} \ +--steps_between_evals=100 \ +--max_length=64 \ +--mode=train \ +--decode_batch_size=32 \ +--decode_max_length=97 \ +--padded_decode=False \ +--num_gpus=1 \ +--dtype=fp16 \ +--distribution_strategy='one_device' \ +--enable_time_history=true \ +--log_steps=100 \ +--loss_scale='dynamic' \ +--precision_mode=${precision_mode} \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} \ + --data_dump_path=${data_dump_path} \ +--profiling=${profiling} \ +--profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#############结果处理######################### +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +single_batch_step_sec=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $8}'|tail -n +2|awk '{sum+=$1} END {print sum/NR}'` +FPS=`awk 'BEGIN{printf "%.2f\n",'${single_batch_step_sec}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'|sed 's/,//g'|cut -c 1-5` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#############冒烟看护######################### +BatchSize=${batch_size} +#设备类型 +DeviceType=`uname -m` +#用例名称 +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` + +##获取Loss +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中 +grep 'Train history' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}'| sed 's/\[//g'|sed 's/\]}//g' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> 
$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From 1b8f1f4337cb4dd28f9d6b991de09251ac862998 Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Fri, 30 Sep 2022 06:18:39 +0000 Subject: [PATCH 07/15] update TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md. Signed-off-by: jieliang cai <975092674@qq.com> --- .../README.md | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md index 9ff988e59..38580f02c 100644 --- a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md +++ b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md @@ -142,6 +142,8 @@ BERT是一种与训练语言表示的方法,这意味着我们在大型文本 将环境变量配置到test/train_*.sh中 +#### 模型训练 + - 单卡训练 启动单卡训练 @@ -164,6 +166,26 @@ BERT是一种与训练语言表示的方法,这意味着我们在大型文本 bash train_ID0495_Bert-Squad_performance_8p.sh ``` +##### 分布式插件使能分布式 + +分布式统一训练脚本`./test/train_ID0495_Bert-Squad_performance_distribute.sh`, 该脚本由`.test/train_ID0495_Bert-Squad_performance_1p.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改 + +训练前请下载工具并根据说明完成配置 + +工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute + + +- 8p训练 +``` +python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_ID0495_Bert-Squad_performance_distribute.sh --data_path=/npu/traindata" +``` + + +- 16p训练 + +``` +python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_ID0495_Bert-Squad_performance_distribute.sh --data_path=/npu/traindata" +```

<h2 id="高级参考.md">高级参考</h2>

-- Gitee From 4c2afcd6dc36a9bb63b1738f1a2a7862f151acf3 Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Fri, 30 Sep 2022 06:22:47 +0000 Subject: [PATCH 08/15] update TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md. Signed-off-by: jieliang cai <975092674@qq.com> --- .../BertNV_Series_for_TensorFlow/README.md | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md index 4b9bb0dc6..fb1c45efb 100644 --- a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md @@ -151,6 +151,8 @@ python3 ${work_path}/src/utils/create_squad_data.py --train_file=${data_path}/tr ## 模型训练 +#### 模型训练 + - 单击“立即下载”,并选择合适的下载方式下载源码包。 - 开始训练。 @@ -226,6 +228,26 @@ python3 ${work_path}/src/utils/create_squad_data.py --train_file=${data_path}/tr bash train_ID3220_BertLarge-Squad2.0_performance_1p.sh --data_path=/home ``` +##### 分布式插件使能分布式 + +ID0060网络分布式统一训练脚本`./test/train_ID0060_BertBase_performance_distribute.sh`, 该脚本由`./test/train_ID0060_BertBase_performance_8p.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改 + +训练前请下载工具并根据说明完成配置 + +工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute + + +- 8p训练 +``` +python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_ID0060_BertBase_performance_distribute.sh --data_path=/npu/traindata" +``` + + +- 16p训练 + +``` +python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_ID0060_BertBase_performance_distribute.sh --data_path=/npu/traindata" +```

<h2 id="高级参考.md">高级参考</h2>

-- Gitee From 1045f30f6a3ebf3034661f621eb9669b5ac0459e Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Fri, 30 Sep 2022 06:23:39 +0000 Subject: [PATCH 09/15] update TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md. Signed-off-by: jieliang cai <975092674@qq.com> --- TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md index fb1c45efb..40d769044 100644 --- a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md @@ -228,7 +228,7 @@ python3 ${work_path}/src/utils/create_squad_data.py --train_file=${data_path}/tr bash train_ID3220_BertLarge-Squad2.0_performance_1p.sh --data_path=/home ``` -##### 分布式插件使能分布式 +#### 分布式插件使能分布式 ID0060网络分布式统一训练脚本`./test/train_ID0060_BertBase_performance_distribute.sh`, 该脚本由`./test/train_ID0060_BertBase_performance_8p.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改 -- Gitee From 21de539fae888b169ea460141dc993678de18d16 Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Fri, 30 Sep 2022 06:24:09 +0000 Subject: [PATCH 10/15] update TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md. Signed-off-by: jieliang cai <975092674@qq.com> --- .../built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md index 38580f02c..c31b0b599 100644 --- a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md +++ b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md @@ -166,7 +166,7 @@ BERT是一种与训练语言表示的方法,这意味着我们在大型文本 bash train_ID0495_Bert-Squad_performance_8p.sh ``` -##### 分布式插件使能分布式 +#### 分布式插件使能分布式 分布式统一训练脚本`./test/train_ID0495_Bert-Squad_performance_distribute.sh`, 该脚本由`.test/train_ID0495_Bert-Squad_performance_1p.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改 -- Gitee From 1dbb083daca297e7cfaf0b954ab4652d1bcbc3c1 Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Fri, 30 Sep 2022 06:26:35 +0000 Subject: [PATCH 11/15] update TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/README.md. Signed-off-by: jieliang cai <975092674@qq.com> --- .../YOLOv5_ID1719_for_TensorFlow2.X/README.md | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/README.md b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/README.md index 00b2d8897..8ef66fccc 100644 --- a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/README.md +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/README.md @@ -128,6 +128,9 @@ npu_device.global_options().precision_mode = 'allow_mix_precision' 2. 
数据集标注文件需要先后使用scripts目录下coco_convert.py及coco_annotation.py生成。标注文件生成后即内含图片路径及box信息,故数据集图片文件不可随意移动位置。 ## 模型训练 + +#### 模型训练 + - 单击“立即下载”,并选择合适的下载方式下载源码包。 - 开始训练。 @@ -168,6 +171,27 @@ npu_device.global_options().precision_mode = 'allow_mix_precision' ├─val2017.txt ``` +#### 分布式插件使能分布式 + +分布式统一训练脚本`./test/train_performance_distribute.sh`, 该脚本由`./test/train_performance_8p.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改 + +训练前请下载工具并根据说明完成配置 + +工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute + + +- 8p训练 +``` +python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +``` + + +- 16p训练 + +``` +python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +``` +

<h2 id="迁移学习指导.md">迁移学习指导</h2>

- 数据集准备。 -- Gitee From debcbfcf85a280aec4a1bbd28a5729fecd0e09a5 Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Fri, 30 Sep 2022 06:29:32 +0000 Subject: [PATCH 12/15] update TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md. Signed-off-by: jieliang cai <975092674@qq.com> --- .../ReadMe.md | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md index b9000ac78..e8745d95a 100644 --- a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md +++ b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md @@ -176,6 +176,9 @@ python3 pack_pretraining_data.py --input-glob="path/to/store/binery/files" --out 文件夹路径需要自己创建。 ## 模型训练 + +#### 模型训练 + - 下载训练脚本。 - 检查并修改configs/目录下8卡IP的json配置文件“rank_table_8p.json"。 @@ -261,6 +264,27 @@ python3 pack_pretraining_data.py --input-glob="path/to/store/binery/files" --out 4.1 含pack策略的训练脚本(./test/目录下名字带有"_packed"的脚本即为相应包含pack策略的训练脚本) 使用pack策略进行训练时,需使用pack过后的数据集(train、eval)及对应的预训练模型。若无对应tensorflow-v2版本packed预训练模型,可由tensorflow-v1版本进行转换得来。模型转换相关脚本为bert/tf2_encoder_checkpoint_converter.py,详见:脚本和事例代码 - 模型转换脚本 +#### 分布式插件使能分布式 + +分布式统一训练脚本`./test/train_performance_distribute.sh`, 该脚本由`./test/train_performance_8p_192bs.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改 + +训练前请下载工具并根据说明完成配置 + +工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute + + +- 8p训练 +``` +python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +``` + + +- 16p训练 + +``` +python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +``` +

<h2 id="高级参考.md">高级参考</h2>

## 脚本和事例代码 -- Gitee From c1afd67bcc94f555e07c95249b467c8b0af2c390 Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Fri, 30 Sep 2022 06:33:33 +0000 Subject: [PATCH 13/15] update TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/ReadMe.md. Signed-off-by: jieliang cai <975092674@qq.com> --- .../ReadMe.md | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/ReadMe.md b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/ReadMe.md index 83e2cd718..4ba84ba7c 100644 --- a/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/ReadMe.md +++ b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/ReadMe.md @@ -163,6 +163,9 @@ npu_device.global_options().precision_mode=FLAGS.precision_mode ## 模型训练 + +#### 模型训练 + - 下载训练脚本。 - 检查scripts/目录下是否有存在8卡IP的json配置文件“rank_table_8p.json"。 @@ -243,7 +246,26 @@ npu_device.global_options().precision_mode=FLAGS.precision_mode train_performance_8p_49152bs_static_noeval.sh --data_path=${Data_Path} +#### 分布式插件使能分布式 + +分布式统一训练脚本`./test/train_performance_distribute.sh`, 该脚本由`./test/train_performance_8p_32768bs_static_noeval.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改 + +训练前请下载工具并根据说明完成配置 +工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute + + +- 8p训练 +``` +python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +``` + + +- 16p训练 + +``` +python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +```

<h2 id="高级参考.md">高级参考</h2>

-- Gitee From 9e6a270d79547781eab4021385cc7caff1bbee5b Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Sat, 8 Oct 2022 02:19:36 +0000 Subject: [PATCH 14/15] update TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md. Signed-off-by: jieliang cai <975092674@qq.com> --- .../ReadMe.md | 34 +++---------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md index e8745d95a..12559b015 100644 --- a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md +++ b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md @@ -4,7 +4,7 @@ - [快速上手](#快速上手.md) - [迁移学习指导](#迁移学习指导.md) - [高级参考](#高级参考.md) -

-<h2 id="基本信息.md">基本信息</h2>

+## 基本信息 **发布者(Publisher):Huawei** @@ -28,7 +28,7 @@ **描述(Description):基于TensorFlow框架的BertLarge自然语言处理网络训练代码** -

-<h2 id="概述.md">概述</h2>

+## 概述 ## 简述 @@ -109,7 +109,7 @@ flags.DEFINE_string(name='precision_mode', default= 'allow_fp32_to_fp16', npu_device.global_options().precision_mode=FLAGS.precision_mode ``` -

-<h2 id="训练环境准备.md">训练环境准备</h2>

+## 训练环境准备 1. 硬件环境准备请参见各硬件产品文档"[驱动和固件安装升级指南]( https://support.huawei.com/enterprise/zh/category/ai-computing-platform-pid-1557196528909)"。需要在硬件设备上安装与CANN版本配套的固件与驱动。 2. 宿主机上需要安装Docker并登录[Ascend Hub中心](https://ascendhub.huawei.com/#/detail?name=ascend-tensorflow-arm)获取镜像。 @@ -139,7 +139,7 @@ npu_device.global_options().precision_mode=FLAGS.precision_mode -

-<h2 id="快速上手.md">快速上手</h2>

+## 快速上手 ## 数据集准备 @@ -176,9 +176,6 @@ python3 pack_pretraining_data.py --input-glob="path/to/store/binery/files" --out 文件夹路径需要自己创建。 ## 模型训练 - -#### 模型训练 - - 下载训练脚本。 - 检查并修改configs/目录下8卡IP的json配置文件“rank_table_8p.json"。 @@ -264,28 +261,7 @@ python3 pack_pretraining_data.py --input-glob="path/to/store/binery/files" --out 4.1 含pack策略的训练脚本(./test/目录下名字带有"_packed"的脚本即为相应包含pack策略的训练脚本) 使用pack策略进行训练时,需使用pack过后的数据集(train、eval)及对应的预训练模型。若无对应tensorflow-v2版本packed预训练模型,可由tensorflow-v1版本进行转换得来。模型转换相关脚本为bert/tf2_encoder_checkpoint_converter.py,详见:脚本和事例代码 - 模型转换脚本 -#### 分布式插件使能分布式 - -分布式统一训练脚本`./test/train_performance_distribute.sh`, 该脚本由`./test/train_performance_8p_192bs.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改 - -训练前请下载工具并根据说明完成配置 - -工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute - - -- 8p训练 -``` -python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" -``` - - -- 16p训练 - -``` -python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" -``` - -

-<h2 id="高级参考.md">高级参考</h2>

+## 高级参考 ## 脚本和事例代码 -- Gitee From 4b04ab455bd0d7801224df5e321640481bde9f8d Mon Sep 17 00:00:00 2001 From: jieliang cai <975092674@qq.com> Date: Sat, 8 Oct 2022 02:21:03 +0000 Subject: [PATCH 15/15] update TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md. Signed-off-by: jieliang cai <975092674@qq.com> --- .../ReadMe.md | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md index 12559b015..f6dc9e204 100644 --- a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md +++ b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md @@ -176,6 +176,9 @@ python3 pack_pretraining_data.py --input-glob="path/to/store/binery/files" --out 文件夹路径需要自己创建。 ## 模型训练 + +#### 模型训练 + - 下载训练脚本。 - 检查并修改configs/目录下8卡IP的json配置文件“rank_table_8p.json"。 @@ -261,6 +264,27 @@ python3 pack_pretraining_data.py --input-glob="path/to/store/binery/files" --out 4.1 含pack策略的训练脚本(./test/目录下名字带有"_packed"的脚本即为相应包含pack策略的训练脚本) 使用pack策略进行训练时,需使用pack过后的数据集(train、eval)及对应的预训练模型。若无对应tensorflow-v2版本packed预训练模型,可由tensorflow-v1版本进行转换得来。模型转换相关脚本为bert/tf2_encoder_checkpoint_converter.py,详见:脚本和事例代码 - 模型转换脚本 +#### 分布式插件使能分布式 + +分布式统一训练脚本`./test/train_performance_distribute.sh`, 该脚本由`./test/train_performance_8p_192bs.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改 + +训练前请下载工具并根据说明完成配置 + +工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute + + +- 8p训练 +``` +python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +``` + + +- 16p训练 + +``` +python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +``` + ## 高级参考 ## 脚本和事例代码 -- Gitee