From f5190c4d26c1afff2929536d4f8aee63fe506126 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Spencer=E5=85=94=E5=AD=90?= <11326804+spencer-rabbit@user.noreply.gitee.com>
Date: Fri, 30 Dec 2022 06:22:28 +0000
Subject: [PATCH 1/4] =?UTF-8?q?add=20DeepCTR=5FSeries=5Ffor=5FTensorFlow/t?=
 =?UTF-8?q?est/train=5FRT2=5FID4032=5FDCNMix=5Ffull=5F1p.sh.=20=E5=A2=9E?=
 =?UTF-8?q?=E5=8A=A0train=5FRT2=5FID4032=5FDCNMix=5Ffull=5F1p.sh?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Spencer兔子 <11326804+spencer-rabbit@user.noreply.gitee.com>
---
 .../test/train_RT2_ID4032_DCNMix_full_1p.sh   | 152 ++++++++++++++++++
 1 file changed, 152 insertions(+)
 create mode 100644 TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_RT2_ID4032_DCNMix_full_1p.sh

diff --git a/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_RT2_ID4032_DCNMix_full_1p.sh b/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_RT2_ID4032_DCNMix_full_1p.sh
new file mode 100644
index 000000000..e108bb58f
--- /dev/null
+++ b/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_RT2_ID4032_DCNMix_full_1p.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+# 1-device RT2 full training run of DCNMix; the training log and a metrics summary are written under output/${ASCEND_DEVICE_ID}/.
+# Current working directory; no need to modify.
+cur_path=$(pwd)
+# Collective-communication settings; no need to modify.
+export RANK_SIZE=1
+export JOB_ID=10087
+RANK_ID_START=0
+# Dataset path; leave empty here, it is supplied via --data_path.
+data_path=""
+# Basic parameters; review and adjust per model.
+# Network name, used to build CaseName below.
+Network="DCNMix_ID4032_for_TensorFlow"
+# Training epochs.
+train_epochs=4
+# Training batch size.
+batch_size=10240
+# Training steps.
+train_steps=16115
+# Learning rate (left empty; it is not passed to run_dcnmix.py).
+learning_rate=
+
+# Debug/maintenance parameters; precision_mode should be reviewed per model.
+precision_mode="allow_fp32_to_fp16"
+# Maintenance parameters below; no need to modify.
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+# Help message; no need to modify.
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_RT2_ID4032_DCNMix_full_1p.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+# Parse command-line arguments of the form --key=value ("$@" keeps each argument intact).
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p "${over_dump_path}"
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p "${data_dump_path}"
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p "${profiling_dump_path}"
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    fi
+done
+
+# data_path is mandatory; abort if it was not supplied.
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+# Training start time (epoch seconds).
+start_time=$(date +%s)
+
+# Enter the training-script directory (review per model); stop if it does not exist.
+cd "${cur_path}/../examples" || exit 1
+
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    # Set environment variables; no need to modify.
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+
+    # Recreate the per-device output directory from scratch.
+    if [ -d "${cur_path}/output/${ASCEND_DEVICE_ID}" ];then
+        rm -rf "${cur_path}/output/${ASCEND_DEVICE_ID}"
+        mkdir -p "${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt"
+    else
+        mkdir -p "${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt"
+    fi
+
+    # Launch training in the background; the arguments listed below need no change, others should be reviewed per model.
+    #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path
+    nohup python3 run_dcnmix.py \
+        --data_path="${data_path}" \
+        --train_batch_size=${batch_size} \
+        --eval_batch_size=${batch_size} \
+        --num_epochs=${train_epochs} \
+        --max_steps=${train_steps} \
+        --output_dir="${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt" > "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" 2>&1 &
+done
+wait
+
+# Training end time and end-to-end duration in seconds.
+end_time=$(date +%s)
+e2e_time=$(( end_time - start_time ))
+
+
+# Print results.
+echo "------------------ Final result ------------------"
+# # Throughput: mean of field 2 of the "examples/sec" lines, skipping the first four samples and the last one; 0 when none are left.
+fps=$(grep "examples/sec" "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | awk '{print $2}' | tail -n +5 | awk 'NR>1{print p}{p=$0}' | awk '{sum+=$1} END {if (NR > 0) print sum/NR; else print 0}')
+FPS=$(awk -v v="${fps}" 'BEGIN{printf "%.2f\n", v}')
+# # Print; no need to modify.
+echo "Final Performance item/sec : $FPS"
+
+# # Training accuracy: field 9 of the last "AUC = " line, with commas removed; review per model.
+train_accuracy=$(grep "AUC = " "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | awk 'END {print $9}' | sed 's/,//g')
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance-monitoring summary.
+# Test-case information. NOTE(review): the other RT2 full scripts in this series end CaseName with 'acc' - confirm 'accu' is intended.
+BatchSize=${batch_size}
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'accu'
+
+## Collect performance data.
+# Seconds per step = batch size / throughput; 0 when no throughput was measured (avoids a division by zero).
+TrainingTime=$(awk -v bs="${BatchSize}" -v fps="${FPS}" 'BEGIN{printf "%.6f\n", (fps > 0) ? bs/fps : 0}')
+
+ActualFPS=${FPS}
+grep ":loss =" "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | awk '{print $3}' | sed 's/,//g' | sed -n '1~2p' > "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt"
+# Loss of the last iteration.
+ActualLoss=$(awk 'END {print}' "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt")
+
+# Write the key metrics to ${CaseName}.log.
+echo "Network = ${Network}" > "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "RankSize = ${RANK_SIZE}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "BatchSize = ${BatchSize}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "DeviceType = ${DeviceType}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "CaseName = ${CaseName}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "ActualFPS = ${ActualFPS}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "TrainAccuracy = ${train_accuracy}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "TrainingTime = ${TrainingTime}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "ActualLoss = ${ActualLoss}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "E2ETrainingTime = ${e2e_time}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
\ No newline at end of file
-- 
Gitee


From b65552e896cb33638f3572eba86a092fcfc1f355 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Spencer=E5=85=94=E5=AD=90?= <11326804+spencer-rabbit@user.noreply.gitee.com>
Date: Fri, 30 Dec 2022 06:57:31 +0000
Subject: [PATCH 2/4] =?UTF-8?q?add=20Transformer=5FID0633=5Ffor=5FTensorFl?=
 =?UTF-8?q?ow2.X/test/train=5FRT2=5Ffull=5F1p=5F6144bs=5Fdynamic=5Fnoeval.?=
 =?UTF-8?q?sh.=20=E5=A2=9E=E5=8A=A0train=5FRT2=5Ffull=5F1p=5F6144bs=5Fdyna?=
 =?UTF-8?q?mic=5Fnoeval.sh?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Spencer兔子 <11326804+spencer-rabbit@user.noreply.gitee.com>
---
 ...train_RT2_full_1p_6144bs_dynamic_noeval.sh | 196 ++++++++++++++++++
 1 file changed, 196 insertions(+)
 create mode 100644 TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_RT2_full_1p_6144bs_dynamic_noeval.sh

diff --git a/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_RT2_full_1p_6144bs_dynamic_noeval.sh b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_RT2_full_1p_6144bs_dynamic_noeval.sh
new file mode 100644
index 000000000..4d556a4ee
--- /dev/null
+++ b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_RT2_full_1p_6144bs_dynamic_noeval.sh
@@ -0,0 +1,196 @@
+#!/bin/bash
+# 1-device RT2 full training run of Transformer (batch size 6144, dynamic loss scale, train-only mode); logs and metrics go to output/${ASCEND_DEVICE_ID}/.
+# Current working directory; no need to modify.
+cur_path=$(pwd)
+
+# Collective-communication settings; no need to modify.
+
+export RANK_SIZE=1
+export JOB_ID=10087
+export RANK_ID_START=0
+export PYTHONPATH=../transformer:$PYTHONPATH
+
+
+# Dataset path; leave empty here, it is supplied via --data_path.
+data_path=""
+
+# Default log level; no need to modify.
+
+# Basic parameters; review and adjust per model.
+# Network name, used to build CaseName below. NOTE(review): the directory is Transformer_ID0633 - confirm ID3215 is intended.
+Network="Transformer_ID3215_for_TensorFlow2.X"
+# Training batch size.
+batch_size=6144
+# Training steps.
+train_steps=250000
+
+# TF2.X only; no need to modify.
+#export NPU_ENABLE_PERF=true
+
+# Debug/maintenance parameters; precision_mode should be reviewed per model.
+precision_mode="allow_mix_precision"
+# Maintenance parameters below; no need to modify.
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+
+# Help message; no need to modify.
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_RT2_full_1p_6144bs_dynamic_noeval.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+# Parse command-line arguments of the form --key=value ("$@" keeps each argument intact).
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p "${over_dump_path}"
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p "${data_dump_path}"
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p "${profiling_dump_path}"
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    fi
+done
+
+# data_path is mandatory; abort if it was not supplied.
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+# Training start time (epoch seconds).
+start_time=$(date +%s)
+
+# Training-script directory: this script stays in the current directory and uses relative paths; review per model.
+
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    # Set environment variables; no need to modify.
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+
+
+
+    # Recreate the per-device output directory; ckpt/ must match the --model_dir passed to the trainer below.
+    if [ -d "${cur_path}/output/${ASCEND_DEVICE_ID}" ];then
+        rm -rf "${cur_path}/output/${ASCEND_DEVICE_ID}"
+        mkdir -p "${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt"
+    else
+        mkdir -p "${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt"
+    fi
+
+    # CPU pinning: each rank gets one eighth of the CPUs, but at least one so the taskset range stays valid on small hosts.
+    cpucount=$(lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}')
+    cpustep=$(( ${cpucount:-0} / 8 )); (( cpustep > 0 )) || cpustep=1
+    echo "taskset c steps:" $cpustep
+    a=$(( RANK_ID * cpustep ))
+    b=$(( RANK_ID + 1 ))
+    c=$(( b * cpustep - 1 ))
+
+    # Launch training in the background; the arguments listed below need no change, others should be reviewed per model.
+    #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune
+    nohup taskset -c $a-$c python3 ../transformer/official/nlp/transformer/transformer_main.py \
+        --data_dir="${data_path}" \
+        --model_dir="${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt" \
+        --vocab_file="${data_path}/vocab.ende.32768" \
+        --param_set=big \
+        --train_steps=${train_steps} \
+        --batch_size=${batch_size} \
+        --steps_between_evals=10000 \
+        --max_length=64 \
+        --mode=train \
+        --decode_batch_size=32 \
+        --decode_max_length=97 \
+        --padded_decode=False \
+        --num_gpus=1 \
+        --dtype=fp16 \
+        --distribution_strategy='one_device' \
+        --enable_time_history=true \
+        --log_steps=1000 \
+        --loss_scale='dynamic' \
+        --precision_mode=${precision_mode} \
+        --over_dump=${over_dump} \
+        --over_dump_path="${over_dump_path}" \
+        --data_dump_flag=${data_dump_flag} \
+        --data_dump_step=${data_dump_step} \
+        --data_dump_path="${data_dump_path}" \
+        --profiling=${profiling} \
+        --profiling_dump_path="${profiling_dump_path}" > "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" 2>&1 &
+done
+wait
+
+# Training end time and end-to-end duration in seconds.
+end_time=$(date +%s)
+e2e_time=$(( end_time - start_time ))
+
+############# Smoke-test monitoring #########################
+BatchSize=${batch_size}
+# Device type.
+DeviceType=$(uname -m)
+# Case name.
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'acc'
+
+echo "------------------ Final result ------------------"
+# Throughput: field 8 of the last TimeHistory line; formats as 0.00 when the log has none. Review per model.
+single_batch_step_sec=$(grep TimeHistory "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | awk 'END {print $8}')
+FPS=$(awk -v v="${single_batch_step_sec}" 'BEGIN{printf "%.2f\n", v}')
+# Print; no need to modify.
+echo "Final Performance images/sec : $FPS"
+
+#grep "Train history" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print$8}'|sed 's/,//g'|sed 's/\[//g'|sed 's/\]//g' |sed 's/\}//g'>> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+ActualLoss=$(grep 10000/10000 "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | awk 'END {print $NF}')
+# Training accuracy: taken from the same "10000/10000" line as ActualLoss; review per model.
+#train_accuracy=`grep "Bleu" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F ": " '{print $2}' | tail -n 1`
+train_accuracy=$(grep 10000/10000 "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | awk 'END {print $NF}')
+# Print; no need to modify.
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+## Collect performance data.
+# Throughput.
+ActualFPS=${FPS}
+# Time per training iteration: batch size * rank size * 1000 / throughput; 0 when no throughput was measured.
+TrainingTime=$(awk -v bs="${BatchSize}" -v rs="${RANK_SIZE}" -v fps="${FPS}" 'BEGIN{printf "%.2f\n", (fps > 0) ? bs*rs*1000/fps : 0}')
+
+## Collect loss.
+# Extract the loss values from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt.
+grep 10000/10000 "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | awk '{print $NF}' > "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt"
+
+# Loss of the last iteration (already captured in ActualLoss above).
+#ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Write the key metrics to ${CaseName}.log.
+echo "Network = ${Network}" > "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "RankSize = ${RANK_SIZE}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "BatchSize = ${BatchSize}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "DeviceType = ${DeviceType}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "CaseName = ${CaseName}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "ActualFPS = ${ActualFPS}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "TrainingTime = ${TrainingTime}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "TrainAccuracy = ${train_accuracy}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "ActualLoss = ${ActualLoss}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "E2ETrainingTime = ${e2e_time}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
\ No newline at end of file
-- 
Gitee


From 924b43d942e912d40a9f6db80c3373d3a3ff8b43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Spencer=E5=85=94=E5=AD=90?= <11326804+spencer-rabbit@user.noreply.gitee.com>
Date: Fri, 30 Dec 2022 07:11:03 +0000
Subject: [PATCH 3/4] =?UTF-8?q?add=20TensorFlow/built-in/recommendation/Wi?=
 =?UTF-8?q?deDeep=5FID2940=5Ffor=5FTensorFlow/test/train=5FRT2=5Ffull=5F1p?=
 =?UTF-8?q?.sh.=20=E5=A2=9E=E5=8A=A0train=5FRT2=5Ffull=5F1p.sh?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Spencer兔子 <11326804+spencer-rabbit@user.noreply.gitee.com>
---
 .../test/train_RT2_full_1p.sh                 | 170 ++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100644 TensorFlow/built-in/recommendation/WideDeep_ID2940_for_TensorFlow/test/train_RT2_full_1p.sh

diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2940_for_TensorFlow/test/train_RT2_full_1p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2940_for_TensorFlow/test/train_RT2_full_1p.sh
new file mode 100644
index 000000000..b5e4709e8
--- /dev/null
+++ b/TensorFlow/built-in/recommendation/WideDeep_ID2940_for_TensorFlow/test/train_RT2_full_1p.sh
@@ -0,0 +1,170 @@
+#!/bin/bash
+# 1-device RT2 full training run of WideDeep on the outbrain TFRecords; logs and a metrics summary go to output/${ASCEND_DEVICE_ID}/.
+# Current working directory; no need to modify.
+cur_path=$(pwd)
+#export ASCEND_SLOG_PRINT_TO_STDOUT=1
+
+# Collective-communication settings; no need to modify.
+
+export RANK_SIZE=1
+export JOB_ID=10087
+RANK_ID_START=0
+
+# Dataset path; leave empty here, it is supplied via --data_path.
+data_path=""
+
+
+# Basic parameters; review and adjust per model.
+# Batch size.
+batch_size=131072
+# Network name, used to build CaseName below.
+Network="WideDeep_TF_ID2940_for_TensorFlow"
+# Number of devices; 1 for a single card.
+RankSize=1
+# Training epochs (optional).
+train_epochs=120
+
+# Parameter configuration.
+data_path=""
+
+# Debug/maintenance parameters; precision_mode should be reviewed per model.
+precision_mode="allow_mix_precision"
+# Maintenance parameters below; no need to modify.
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+autotune=False
+
+# Help message; no need to modify.
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_RT2_full_1p.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+# Parse command-line arguments of the form --key=value ("$@" keeps each argument intact).
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p "${over_dump_path}"
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p "${data_dump_path}"
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p "${profiling_dump_path}"
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    fi
+done
+
+# data_path is mandatory; abort if it was not supplied.
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+############## Run training ##########
+# Start every run from a clean output tree. The clean-up runs once, and the
+# checkpoint directory is always created because it is passed to the trainer
+# as --model_dir below.
+if [ -d "${cur_path}/output" ];then
+    # ${cur_path:?} aborts instead of expanding to "/output/*" should cur_path ever be empty.
+    rm -rf "${cur_path:?}"/output/*
+fi
+mkdir -p "${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt"
+
+# Use the rank table that matches this device when one exists, otherwise fall
+# back to the table for device 0. The test is -f because the rank table is a
+# regular JSON file, not a directory, and the file name is built from
+# ASCEND_DEVICE_ID, the variable used everywhere else in this script.
+# NOTE(review): the first branch exports RANK_ID as the device id; confirm that
+# this matches the rank_id recorded in the per-device rank table.
+rank_table="${cur_path}/../config/1p_${ASCEND_DEVICE_ID}.json"
+if [ -f "${rank_table}" ];then
+    export RANK_TABLE_FILE=${rank_table}
+    export RANK_ID=$ASCEND_DEVICE_ID
+else
+    export RANK_TABLE_FILE=$cur_path/../config/1p_0.json
+    export RANK_ID=0
+fi
+
+cd "${cur_path}/../" || exit 1
+start=$(date +%s)
+nohup python3 -m trainer.task --gpu \
+    --model_type=wide \
+    --train_data_pattern="${data_path}/outbrain/tfrecords/train/part*" \
+    --eval_data_pattern="${data_path}/outbrain/tfrecords/eval/part*" \
+    --model_dir="${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt" \
+    --transformed_metadata_path="${data_path}/outbrain/tfrecords" \
+    --num_epochs=$train_epochs > "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" 2>&1 &
+wait
+end=$(date +%s)
+e2e_time=$(( end - start ))
+
+
+# Print results.
+echo "------------------ Final result ------------------"
+# Throughput = batch size * global_step/sec, using the second-to-last global_step/sec line; review per model.
+
+Time=$(grep "INFO:tensorflow:global_step/sec: " "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | awk -F' ' '{print $2}' | tail -n 2 | head -n +1)
+FPS=$(awk -v bs="${batch_size}" -v t="${Time}" 'BEGIN{printf "%.2f\n", bs*t}')
+
+
+# Print; no need to modify.
+echo "Final Performance images/sec : $FPS"
+
+# Training accuracy: first value after the second " = " on the last "Saving dict for global step" line; review per model.
+train_accuracy=$(grep 'INFO:tensorflow:Saving dict for global step' "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | tail -n 1 | awk -F' = ' '{print $3}' | awk -F',' '{print $1}')
+# Print; no need to modify.
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Stability/accuracy monitoring summary.
+# Test-case information.
+BatchSize=${batch_size}
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'acc'
+
+## Collect performance data.
+# Throughput.
+ActualFPS=${FPS}
+# Time per training iteration in ms: batch size * 1000 / throughput; 0 when no throughput was measured.
+TrainingTime=$(awk -v bs="${batch_size}" -v fps="${FPS}" 'BEGIN{printf "%.2f\n", (fps > 0) ? bs*1000/fps : 0}')
+
+# Extract the loss values from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model.
+loss=$(grep 'INFO:tensorflow:loss' "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | tr -d '\b\r' | grep -Eo "INFO:tensorflow:loss = [0-9]*\.[0-9]*" | awk -F' = ' '{print $2}')
+echo "${loss}" > "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt"
+
+# Loss of the last iteration.
+ActualLoss=$(tail -n 1 "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt")
+
+# Write the key metrics to ${CaseName}.log.
+echo "Network = ${Network}" > "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "RankSize = ${RANK_SIZE}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "BatchSize = ${BatchSize}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "DeviceType = ${DeviceType}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "CaseName = ${CaseName}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "ActualFPS = ${ActualFPS}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "TrainingTime = ${TrainingTime}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "TrainAccuracy = ${train_accuracy}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "ActualLoss = ${ActualLoss}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "E2ETrainingTime = ${e2e_time}" >> "${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
\ No newline at end of file
-- 
Gitee


From 1813a77b1f563a7b4bd53c8e5a8931c1e638cb2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Spencer=E5=85=94=E5=AD=90?= <11326804+spencer-rabbit@user.noreply.gitee.com>
Date: Fri, 30 Dec 2022 07:21:27 +0000
Subject: [PATCH 4/4] =?UTF-8?q?add=20Swin-Transformer=5FID2374=5Ffor=5FTen?=
 =?UTF-8?q?sorFlow2.X/test/train=5FRT2=5Ffull=5F1p.sh.=20=E5=A2=9E?=
 =?UTF-8?q?=E5=8A=A0train=5FRT2=5Ffull=5F1p.sh?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Spencer兔子 <11326804+spencer-rabbit@user.noreply.gitee.com>
---
 .../test/train_RT2_full_1p.sh                 | 152 ++++++++++++++++++
 1 file changed, 152 insertions(+)
 create mode 100644 TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2374_for_TensorFlow2.X/test/train_RT2_full_1p.sh

diff --git a/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2374_for_TensorFlow2.X/test/train_RT2_full_1p.sh b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2374_for_TensorFlow2.X/test/train_RT2_full_1p.sh
new file mode 100644
index 000000000..366ab7f63
--- /dev/null
+++ b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2374_for_TensorFlow2.X/test/train_RT2_full_1p.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+# 1-device RT2 full training run of Swin-Transformer on CIFAR-100; logs and a metrics summary go to test/output/${ASCEND_DEVICE_ID}/.
+cur_path=$(pwd)/../
+# Print failing-case logs to the screen.
+#export ASCEND_SLOG_PRINT_TO_STDOUT=1
+
+# Basic parameters; review and adjust per model.
+# Batch size.
+batch_size=32
+# Network name, used to build CaseName below.
+Network="Swin-Transformer_ID2374_for_TensorFlow2.X"
+# Number of devices; 1 for a single card. Used as the fallback when RANK_SIZE is not set in the environment.
+RankSize=1
+# Training epochs (optional).
+train_epochs=
+# Training steps.
+train_steps=
+# Learning rate.
+learning_rate=0.01
+
+# Parameter configuration.
+data_path=""
+
+############ Debug/maintenance parameters ##############
+precision_mode="allow_mix_precision"
+# Maintenance parameters below; no need to modify.
+over_dump=False
+if [[ $over_dump == True ]];then
+    over_dump_path=$cur_path/test/overflow_dump # cur_path is the code root here
+    mkdir -p "${over_dump_path}"
+fi
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+use_mixlist=False
+mixlist_file="./configs/ops_info.json"
+fusion_off_flag=False
+fusion_off_file="./configs/fusion_switch.cfg"
+############ Debug/maintenance parameters ##############
+
+if [[ $1 == --help || $1 == --h || $1 == -h ]];then
+    echo "usage:./train_RT2_full_1p.sh "
+    exit 1
+fi
+
+for para in "$@"
+do
+    if [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p "${over_dump_path}"
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p "${data_dump_path}"
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p "${profiling_dump_path}"
+    elif [[ $para == --use_mixlist* ]];then
+        use_mixlist=${para#*=}
+    elif [[ $para == --mixlist_file* ]];then
+        mixlist_file=${para#*=}
+    elif [[ $para == --fusion_off_flag* ]];then
+        fusion_off_flag=${para#*=}
+    elif [[ $para == --fusion_off_file* ]];then
+        fusion_off_file=${para#*=}
+    fi
+done
+
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+############## Run training ##########
+cd "${cur_path}" || exit 1
+if [ -d "${cur_path}/test/output" ];then
+    rm -rf "${cur_path:?}"/test/output/*
+    mkdir -p "${cur_path}/test/output/${ASCEND_DEVICE_ID}"
+else
+    mkdir -p "${cur_path}/test/output/${ASCEND_DEVICE_ID}"
+fi
+wait
+
+# Copy the CIFAR-100 dataset into the Keras dataset directory (~/.keras/datasets), creating it first if needed.
+keras_datasets_dir="${HOME:-/root}/.keras/datasets"
+mkdir -p "${keras_datasets_dir}"
+cp -r "${data_path}/cifar-100-python" "${keras_datasets_dir}/"
+
+start=$(date +%s)
+nohup python3 swin_transformers.py \
+    --precision_mode=${precision_mode} \
+    --over_dump=${over_dump} \
+    --over_dump_path="${over_dump_path}" \
+    --data_dump_flag=${data_dump_flag} \
+    --data_dump_step=${data_dump_step} \
+    --data_dump_path="${data_dump_path}" \
+    --profiling=${profiling} \
+    --use_mixlist=${use_mixlist} \
+    --fusion_off_flag=${fusion_off_flag} \
+    --mixlist_file="${mixlist_file}" \
+    --fusion_off_file="${fusion_off_file}" \
+    --profiling_dump_path="${profiling_dump_path}" > "${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" 2>&1 &
+wait
+end=$(date +%s)
+e2etime=$(( end - start ))
+
+#echo "Final Performance ms/step : $average_perf"
+echo "Final Training Duration sec : $e2etime"
+
+# Print results.
+echo "------------------ Final result ------------------"
+# Step time: field 4 of the last finished "352/352" progress line (no ETA), cut at the first 'm'; review per model.
+TrainingTime=$(grep "352/352" "${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | awk -F '352/352' '{print $2}' | grep -v 'ETA' | grep 'loss:' | awk 'END {print $4}' | cut -d 'm' -f -1)
+
+
+# Performance-monitoring summary.
+# Test-case information; RANK_SIZE falls back to RankSize because this script does not export RANK_SIZE itself.
+BatchSize=${batch_size}
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE:-${RankSize}}'p'_'RT2'_'acc'
+
+## Collect performance data.
+# Throughput = 1000 * batch size / step time; 0 when no step time could be parsed (avoids a division by zero).
+ActualFPS=$(awk -v bs="${batch_size}" -v t="${TrainingTime}" 'BEGIN{printf "%.2f\n", (t > 0) ? 1000*bs/t : 0}')
+
+# Model accuracy: field 10 of the last "352/352" progress line.
+train_accuracy=$(grep "352/352" "${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | awk -F '352/352' '{print $2}' | grep 'loss:' | awk 'END {print $10}')
+
+# Extract the loss values from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model.
+grep "352/352" "${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | awk -F '352/352' '{print $2}' | grep -v 'ETA' | grep 'loss:' | awk '{print $7}' > "${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt"
+
+# Loss of the last iteration.
+ActualLoss=$(awk 'END {print}' "${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt")
+
+# Write the key metrics to ${CaseName}.log.
+echo "Network = ${Network}" > "${cur_path}/test/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "RankSize = ${RANK_SIZE:-${RankSize}}" >> "${cur_path}/test/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "BatchSize = ${BatchSize}" >> "${cur_path}/test/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "DeviceType = ${DeviceType}" >> "${cur_path}/test/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "CaseName = ${CaseName}" >> "${cur_path}/test/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "ActualFPS = ${ActualFPS}" >> "${cur_path}/test/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "TrainingTime = ${TrainingTime}" >> "${cur_path}/test/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "ActualLoss = ${ActualLoss}" >> "${cur_path}/test/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "TrainAccuracy = ${train_accuracy}" >> "${cur_path}/test/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
+echo "E2ETrainingTime = ${e2etime}" >> "${cur_path}/test/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
\ No newline at end of file
-- 
Gitee