diff --git a/TensorFlow/built-in/cv/detection/FasterRcnn_resnet50_ID0010_for_TensorFlow/test/train_RT2_full_1p.sh b/TensorFlow/built-in/cv/detection/FasterRcnn_resnet50_ID0010_for_TensorFlow/test/train_RT2_full_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..20d91ca02e6f5d79d434e799f0ac442151f03748
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/FasterRcnn_resnet50_ID0010_for_TensorFlow/test/train_RT2_full_1p.sh
@@ -0,0 +1,190 @@
+#!/bin/bash
+# Full FasterRcnn (resnet50) training on a single device; logs and a summary go to $output_dir/<rank>.
+export JOB_ID=10000
+
+exec_mode='train' # or 'train_and_eval'
+eval_after_training=True
+
+backbone='resnet50'
+backbone_ckpt_path='/npu/traindata/resnet50_ckpt'
+data_path='/npu/traindata/coco_official_2017'
+
+batch_size=2
+steps=720000
+
+learning_rate_type='cosine' # or 'step'
+learning_rate=0.003
+warmup_learning_rate=0.00025
+warmup_steps=16000
+learning_rate_levels='[0.0003, 0.00003]'
+learning_rate_steps='[480000, 640000]'
+
+precision_mode='allow_mix_precision'
+loss_scale_flag=0
+loss_scale_value=256
+overflow_dump=False
+
+########## params from command line ##########
+# Every "--name=value" argument overrides the shell variable "name" defined above.
+for arg in "$@"; do
+    if [[ "${arg:0:2}" == '--' ]]; then
+        arg=${arg:2}
+        # Only arguments containing "=" are imported; bare flags are skipped.
+        if [[ "$arg" == *=* ]]; then
+            var_name=${arg%%=*}
+            var_value=${arg#*=}
+            printf -v "$var_name" '%s' "$var_value" # literal assignment, the value is never evaluated
+        fi
+    fi
+done
+
+if [[ -z "$output_dir" ]]; then
+    output_dir="$(pwd)/output/"
+fi
+echo "output_dir=$output_dir"
+
+training_file_pattern=${training_file_pattern:-$data_path'/tfrecord/train*'}
+validation_file_pattern=${validation_file_pattern:-$data_path'/tfrecord/val*'}
+val_json_file=${val_json_file:-$data_path'/annotations/instances_val2017.json'}
+
+########## build params_override ##########
+# Comma-separated key=value list handed to mask_rcnn_main.py via --params_override.
+unset params_override
+params_override+="backbone=$backbone,"
+params_override+="checkpoint='$backbone_ckpt_path',"
+params_override+="training_file_pattern='$training_file_pattern',"
+params_override+="validation_file_pattern='$validation_file_pattern',"
+params_override+="val_json_file='$val_json_file',"
+params_override+="train_batch_size=$batch_size,"
+params_override+="total_steps=$steps,"
+params_override+="learning_rate_type=$learning_rate_type,"
+params_override+="init_learning_rate=$learning_rate,"
+params_override+="warmup_learning_rate=$warmup_learning_rate,"
+params_override+="warmup_steps=$warmup_steps,"
+params_override+="learning_rate_levels='$learning_rate_levels',"
+params_override+="learning_rate_steps='$learning_rate_steps',"
+params_override+="npu_precision_mode=$precision_mode,"
+params_override+="npu_loss_scale_flag=$loss_scale_flag,"
+params_override+="npu_loss_scale=$loss_scale_value,"
+params_override+="npu_overflow_dump=$overflow_dump,"
+
+echo "[params_override] $params_override"
+
+########## prepare environment ##########
+
+export RANK_SIZE=1
+
+if [[ -z "$RANK_ID_START" ]]; then
+    if [[ -n "$ASCEND_DEVICE_ID" ]]; then
+        RANK_ID_START=$ASCEND_DEVICE_ID
+    elif [[ -n "$DEVICE_ID" ]]; then
+        RANK_ID_START=$DEVICE_ID
+    else
+        RANK_ID_START=0
+    fi
+fi
+export RANK_ID_START
+echo "RANK_ID_START=$RANK_ID_START"
+
+BASE_PATH=$(cd "$(dirname "$0")" && pwd)/../FasterRcnn
+echo "BASE_PATH=$BASE_PATH"
+
+########## run ##########
+
+start_time=$(date +%s)
+
+pids=()
+for ((RANK_ID = RANK_ID_START; RANK_ID < RANK_SIZE + RANK_ID_START; RANK_ID++));
+do
+    echo
+    /usr/local/Ascend/driver/tools/msnpureport -d $RANK_ID -g error
+
+    TMP_PATH=$output_dir/$RANK_ID
+    mkdir -p "$TMP_PATH"
+    cd "$TMP_PATH" || exit 1
+
+    rm -f configs
+    ln -s "$BASE_PATH/configs" configs
+
+    export RANK_ID
+    export DEVICE_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    export DEVICE_INDEX=$RANK_ID
+
+    python3 "$BASE_PATH/mask_rcnn_main.py" --mode=$exec_mode \
+        --eval_after_training=$eval_after_training \
+        --model_dir="$TMP_PATH/result" \
+        --num_gpus=$RANK_SIZE \
+        --params_override="$params_override" \
+        "$@" 2>&1 | tee "$TMP_PATH/train_${RANK_ID}.log" &
+
+    pids[$RANK_ID-$RANK_ID_START]="$RANK_ID $!" # NOTE: $! is the PID of the last pipeline stage (tee)
+    cd -
+done
+
+sleep 1
+echo "########## Waiting for pids: ${pids[*]}"
+
+for entry in "${pids[@]}"; do
+    # Each entry is "<rank> <pid>" as recorded by the launch loop.
+    RANK_ID=${entry%% *}
+    pid=${entry##* }
+
+    wait "$pid"
+    ret=$? # NOTE(review): this is tee's exit status, not python3's -- see the launch loop
+    echo "******************** train finished ******************** $RANK_ID - $pid - ret : $ret"
+
+    ############################## End-to-end training duration ##############################
+    end_time=$(date +%s)
+    e2e_time=$(( end_time - start_time ))
+    echo "Final Training Duration sec : $e2e_time"
+
+    ############################## Service logs ##############################
+    grep ERROR /root/ascend/log/plog/plog-${pid}_*.log > "$output_dir/$RANK_ID/plog_err.log"
+
+    log_file=$output_dir/$RANK_ID/train_${RANK_ID}.log
+
+    ############################## Performance results ##############################
+    echo "-------------------- Final result --------------------"
+    # Throughput: last reported global_step/sec times the batch size (0.00 when the log has no match).
+    FPS=$(grep -a 'INFO:tensorflow:global_step/sec: ' "$log_file" | awk 'END {print $2}')
+    FPS=$(awk -v bs="${batch_size}" -v sps="${FPS:-0}" 'BEGIN{printf "%.2f\n", bs*sps}')
+    echo "Final Performance images/sec : $FPS"
+
+    ############################## Accuracy results ##############################
+    # Accuracy: last field of the first "Average Precision" line; adjust per network.
+    train_accuracy=$(grep "Average Precision" "$log_file" | awk 'NR==1 {print $NF}')
+    if [[ -n "$train_accuracy" ]]; then
+        echo "Final Training Accuracy mAP: $train_accuracy"
+    fi
+
+    ############################## Performance monitoring summary ##############################
+
+    Network=FasterRcnn_resnet50_ID0010_for_TensorFlow
+
+    DeviceType=$(uname -m)
+    CaseName=${Network}_${backbone}_bs${batch_size}_${RANK_SIZE}'p'_'RT2'_'acc'
+    ActualFPS=${FPS}
+    TrainingTime=$(awk -v bs="${batch_size}" -v fps="${FPS:-0}" 'BEGIN{if (fps+0 > 0) printf "%.2f\n", bs*1000/fps}')
+
+    # Append the loss values to train_${CaseName}_loss.txt; adjust per model.
+    grep "INFO:tensorflow:loss" "$log_file" | awk '{print $3}' | sed 's/,//g' | sed '/^$/d' >> "$output_dir/$RANK_ID/train_${CaseName}_loss.txt"
+
+    ActualLoss=$(awk 'END {print}' "$output_dir/$RANK_ID/train_${CaseName}_loss.txt")
+    echo "Network = ${Network}" > "$output_dir/$RANK_ID/${CaseName}.log"
+    echo "RankSize = ${RANK_SIZE}" >> "$output_dir/$RANK_ID/${CaseName}.log"
+    echo "BatchSize = ${batch_size}" >> "$output_dir/$RANK_ID/${CaseName}.log"
+    echo "DeviceType = ${DeviceType}" >> "$output_dir/$RANK_ID/${CaseName}.log"
+    echo "CaseName = ${CaseName}" >> "$output_dir/$RANK_ID/${CaseName}.log"
+    echo "ActualFPS = ${ActualFPS}" >> "$output_dir/$RANK_ID/${CaseName}.log"
+    echo "TrainingTime = ${TrainingTime}" >> "$output_dir/$RANK_ID/${CaseName}.log"
+    echo "ActualLoss = ${ActualLoss}" >> "$output_dir/$RANK_ID/${CaseName}.log"
+    echo "E2ETrainingTime = ${e2e_time}" >> "$output_dir/$RANK_ID/${CaseName}.log"
+    if [[ -n "$train_accuracy" ]]; then
+        echo "TrainAccuracy = ${train_accuracy}" >> "$output_dir/$RANK_ID/${CaseName}.log"
+    fi
+
+    # Eval support is still being developed; the final loss temporarily stands in for accuracy.
+    echo "Final Training Accuracy loss: $ActualLoss"
+done
+
diff --git a/TensorFlow/built-in/cv/detection/FasterRcnn_resnet50_ID0010_for_TensorFlow/test/train_RT2_full_8p.sh b/TensorFlow/built-in/cv/detection/FasterRcnn_resnet50_ID0010_for_TensorFlow/test/train_RT2_full_8p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ce8b77d248b20d0e44dd618b15a5f3ae47ce0c07
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/FasterRcnn_resnet50_ID0010_for_TensorFlow/test/train_RT2_full_8p.sh
@@ -0,0 +1,200 @@
+#!/bin/bash
+# Full FasterRcnn (resnet50) training with RANK_SIZE=8; one background worker is launched per rank.
+export JOB_ID=10000
+
+exec_mode='train' # or 'train_and_eval'
+eval_after_training=True
+
+backbone='resnet50'
+backbone_ckpt_path='/npu/traindata/resnet50_ckpt'
+data_path='/npu/traindata/coco_official_2017'
+
+batch_size=2
+steps=90000
+
+learning_rate_type='cosine' # or 'step'
+learning_rate=0.02
+warmup_learning_rate=0.0067
+warmup_steps=500
+learning_rate_levels='[0.002, 0.0002]'
+learning_rate_steps='[60000, 80000]'
+
+precision_mode='allow_mix_precision'
+loss_scale_flag=0
+loss_scale_value=256
+overflow_dump=False
+
+########## params from command line ##########
+# Every "--name=value" argument overrides the shell variable "name" defined above.
+for arg in "$@"; do
+    if [[ "${arg:0:2}" == '--' ]]; then
+        arg=${arg:2}
+        # Only arguments containing "=" are imported; bare flags are skipped.
+        if [[ "$arg" == *=* ]]; then
+            var_name=${arg%%=*}
+            var_value=${arg#*=}
+            printf -v "$var_name" '%s' "$var_value" # literal assignment, the value is never evaluated
+        fi
+    fi
+done
+
+for para in "$@"
+do
+    if [[ $para == --bind_core* ]];then
+        bind_core=${para#*=}
+        name_bind="_bindcore" # suffix added to CaseName in the summary
+    fi
+done
+
+if [[ -z "$output_dir" ]]; then
+    output_dir="$(pwd)/output/"
+fi
+echo "output_dir=$output_dir"
+
+training_file_pattern=${training_file_pattern:-$data_path'/tfrecord/train*'}
+validation_file_pattern=${validation_file_pattern:-$data_path'/tfrecord/val*'}
+val_json_file=${val_json_file:-$data_path'/annotations/instances_val2017.json'}
+
+########## build params_override ##########
+# Comma-separated key=value list handed to mask_rcnn_main.py via --params_override.
+unset params_override
+params_override+="backbone=$backbone,"
+params_override+="checkpoint='$backbone_ckpt_path',"
+params_override+="training_file_pattern='$training_file_pattern',"
+params_override+="validation_file_pattern='$validation_file_pattern',"
+params_override+="val_json_file='$val_json_file',"
+params_override+="train_batch_size=$batch_size,"
+params_override+="total_steps=$steps,"
+params_override+="learning_rate_type=$learning_rate_type,"
+params_override+="init_learning_rate=$learning_rate,"
+params_override+="warmup_learning_rate=$warmup_learning_rate,"
+params_override+="warmup_steps=$warmup_steps,"
+params_override+="learning_rate_levels='$learning_rate_levels',"
+params_override+="learning_rate_steps='$learning_rate_steps',"
+params_override+="npu_precision_mode=$precision_mode,"
+params_override+="npu_loss_scale_flag=$loss_scale_flag,"
+params_override+="npu_loss_scale=$loss_scale_value,"
+params_override+="npu_overflow_dump=$overflow_dump,"
+
+echo "[params_override] $params_override"
+
+########## prepare environment ##########
+
+export RANK_SIZE=8
+export RANK_ID_START=0
+
+BASE_PATH=$(cd "$(dirname "$0")" && pwd)/../FasterRcnn
+echo "BASE_PATH=$BASE_PATH"
+
+export RANK_TABLE_FILE=$BASE_PATH/npu_config/8p.json
+
+rm -rf /root/ascend/log
+
+########## run ##########
+
+start_time=$(date +%s)
+
+pids=()
+for ((RANK_ID = RANK_ID_START; RANK_ID < RANK_SIZE + RANK_ID_START; RANK_ID++));
+do
+    echo
+    /usr/local/Ascend/driver/tools/msnpureport -d $RANK_ID -g error
+
+    TMP_PATH=$output_dir/$RANK_ID
+    mkdir -p "$TMP_PATH"
+    cd "$TMP_PATH" || exit 1
+
+    rm -f configs
+    ln -s "$BASE_PATH/configs" configs
+
+    export RANK_ID
+    export DEVICE_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    export DEVICE_INDEX=$RANK_ID
+
+    corenum=$(grep -c 'processor' /proc/cpuinfo)
+    a=$(( RANK_ID * corenum / 8 ))
+    b=$(( RANK_ID + 1 ))
+    c=$(( b * corenum / 8 - 1 ))
+    bind_core="taskset -c $a-$c" # NOTE(review): replaces any --bind_core value, so binding is always on -- confirm intent
+
+    ${bind_core} python3 "$BASE_PATH/mask_rcnn_main.py" --mode=$exec_mode \
+        --eval_after_training=$eval_after_training \
+        --model_dir="$TMP_PATH/result" \
+        --num_gpus=$RANK_SIZE \
+        --params_override="$params_override" \
+        "$@" 2>&1 | tee "$TMP_PATH/train_${RANK_ID}.log" &
+
+    pids[$RANK_ID-$RANK_ID_START]="$RANK_ID $!" # NOTE: $! is the PID of the last pipeline stage (tee)
+    cd -
+done
+
+sleep 1
+echo "########## Waiting for pids: ${pids[*]}"
+
+for entry in "${pids[@]}"; do
+    # Each entry is "<rank> <pid>" as recorded by the launch loop.
+    RANK_ID=${entry%% *}
+    pid=${entry##* }
+
+    wait "$pid"
+    ret=$? # NOTE(review): this is tee's exit status, not python3's -- see the launch loop
+    echo "******************** train finished ******************** $RANK_ID - $pid - ret : $ret"
+
+    ############################## End-to-end training duration ##############################
+    end_time=$(date +%s)
+    e2e_time=$(( end_time - start_time ))
+    echo "Final Training Duration sec : $e2e_time"
+
+    ############################## Service logs ##############################
+    grep ERROR /root/ascend/log/plog/plog-${pid}_*.log > "$output_dir/$RANK_ID/plog_err.log"
+
+    log_file=$output_dir/$RANK_ID/train_${RANK_ID}.log
+
+    ############################## Performance results ##############################
+    echo "-------------------- Final result --------------------"
+    # Throughput: last reported global_step/sec times batch size times 8 (0.00 when the log has no match).
+    FPS=$(grep -a 'INFO:tensorflow:global_step/sec: ' "$log_file" | awk 'END {print $2}')
+    FPS=$(awk -v bs="${batch_size}" -v sps="${FPS:-0}" 'BEGIN{printf "%.2f\n", bs*sps*8}')
+    echo "Final Performance images/sec : $FPS"
+
+    ############################## Accuracy results ##############################
+    # Accuracy: last field of the first "Average Precision" line; adjust per network.
+    train_accuracy=$(grep "Average Precision" "$log_file" | awk 'NR==1 {print $NF}')
+    if [[ -n "$train_accuracy" ]]; then
+        echo "Final Training Accuracy mAP: $train_accuracy"
+    fi
+
+    ############################## Performance monitoring summary ##############################
+
+    Network=FasterRcnn_resnet50_ID0010_for_TensorFlow
+
+    DeviceType=$(uname -m)
+    CaseName=${Network}${name_bind}_${backbone}_bs${batch_size}_${RANK_SIZE}'p'_'RT2'_'acc'
+    ActualFPS=${FPS}
+    TrainingTime=$(awk -v bs="${batch_size}" -v fps="${FPS:-0}" 'BEGIN{if (fps+0 > 0) printf "%.2f\n", bs*1000/fps}')
+
+    # Append the loss values to train_${CaseName}_loss.txt; adjust per model.
+    grep "INFO:tensorflow:loss" "$log_file" | awk '{print $3}' | sed 's/,//g' | sed '/^$/d' >> "$output_dir/$RANK_ID/train_${CaseName}_loss.txt"
+    # NOTE(review): the summary always targets the hard-coded rank-7 directory and is rewritten for every rank.
+    ActualLoss=$(awk 'END {print}' "$output_dir/7/train_${CaseName}_loss.txt")
+    echo "Network = ${Network}" > "$output_dir/7/${CaseName}.log"
+    echo "RankSize = ${RANK_SIZE}" >> "$output_dir/7/${CaseName}.log"
+    echo "BatchSize = ${batch_size}" >> "$output_dir/7/${CaseName}.log"
+    echo "DeviceType = ${DeviceType}" >> "$output_dir/7/${CaseName}.log"
+    echo "CaseName = ${CaseName}" >> "$output_dir/7/${CaseName}.log"
+    echo "ActualFPS = ${ActualFPS}" >> "$output_dir/7/${CaseName}.log"
+    echo "TrainingTime = ${TrainingTime}" >> "$output_dir/7/${CaseName}.log"
+    echo "ActualLoss = ${ActualLoss}" >> "$output_dir/7/${CaseName}.log"
+    echo "E2ETrainingTime = ${e2e_time}" >> "$output_dir/7/${CaseName}.log"
+    if [[ -n "$train_accuracy" ]]; then
+        echo "TrainAccuracy = ${train_accuracy}" >> "$output_dir/7/${CaseName}.log"
+    fi
+
+    # Eval support is still being developed; the final loss temporarily stands in for accuracy.
+    echo "Final Training Accuracy loss: $ActualLoss"
+done
+
+echo "########## copying slog ##########"
+cp -r /root/ascend/log/ "$output_dir/slog"
+echo "########## DONE copying slog ##########"
diff --git a/TensorFlow/built-in/cv/detection/MaskRcnn_ID0011_for_TensorFlow/test/train_RT2_full_8p.sh b/TensorFlow/built-in/cv/detection/MaskRcnn_ID0011_for_TensorFlow/test/train_RT2_full_8p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..41f18d6c4de30cb2f77428d5e661b492a74b9e8f
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/MaskRcnn_ID0011_for_TensorFlow/test/train_RT2_full_8p.sh
@@ -0,0 +1,215 @@
+#!/bin/bash
+# Full MaskRcnn training with RANK_SIZE=8; --data_path=<dataset dir> is mandatory.
+# Current directory; no need to modify.
+cur_path=$(pwd)
+
+export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+
+# Collective-communication settings; no need to modify.
+# rank_table_8p.json must live in the configs directory that sits next to test/.
+export JOB_ID=9999001
+export RANK_SIZE=8
+export RANK_TABLE_FILE=${cur_path}/../configs/rank_table_8p.json
+RANK_ID_START=0
+
+
+# Dataset path; keep empty here, it is supplied via --data_path.
+data_path=""
+
+# Default log level; no need to modify.
+export ASCEND_GLOBAL_LOG_LEVEL=3
+
+# Basic parameters; review per model.
+# Network name, same as the directory name.
+Network="MaskRcnn_ID0011_for_TensorFlow"
+
+batch_size=2
+
+# TF2.x only; no need to modify.
+#export NPU_LOOP_SIZE=${train_steps}
+
+# Debug parameters; review precision_mode per model.
+precision_mode="allow_mix_precision"
+# Maintenance parameters; nothing below needs modification.
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+autotune=False
+
+# Help text; no need to modify.
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_RT2_full_8p.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --autotune               whether to enable autotune, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+# Argument parsing: each "--name=value" sets the matching variable; some also create a dump directory.
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p "${over_dump_path}"
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p "${data_dump_path}"
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p "${profiling_dump_path}"
+    elif [[ $para == --autotune* ]];then
+        autotune=${para#*=} # NOTE(review): install_path is not set in this script; presumably inherited from the environment -- confirm
+        mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak
+        mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak
+        autotune_dump_path=${cur_path}/output/autotune_dump
+        mkdir -p ${autotune_dump_path}/GA
+        mkdir -p ${autotune_dump_path}/RL
+        cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/
+        cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --bind_core* ]]; then
+        bind_core=${para#*=}
+        name_bind="_bindcore" # suffix added to CaseName in the summary
+    fi
+done
+
+# Verify that data_path was supplied; no need to modify.
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+# With autotune enabled, first run single-device training with autotune; no need to modify.
+if [[ $autotune == True ]]; then
+    train_full_1p.sh --autotune=$autotune --data_path=$data_path # NOTE(review): resolved via PATH -- confirm the intended script
+    wait
+    autotune=False
+fi
+
+# Training start time; no need to modify.
+start_time=$(date +%s)
+
+# Enter the training script directory; review per model.
+cd "$cur_path/../" || exit 1
+for ((RANK_ID = RANK_ID_START; RANK_ID < RANK_SIZE + RANK_ID_START; RANK_ID++));
+do
+    # Set environment variables; no need to modify.
+    echo "Device ID: $RANK_ID"
+    export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    ASCEND_DEVICE_ID=$RANK_ID
+
+    # Model-specific environment variables.
+
+    export DEVICE_ID=$RANK_ID
+    DEVICE_INDEX=$DEVICE_ID
+    export DEVICE_INDEX=${DEVICE_INDEX}
+    export FUSION_TENSOR_SIZE=1000000000
+    # for reproducible results
+    export TF_DETERMINISTIC_OPS=1
+    export TF_CUDNN_DETERMINISM=1
+
+    # (Re)create the per-device output directory; no need to modify.
+    if [ -d "${cur_path}/output/${ASCEND_DEVICE_ID}" ];then
+        rm -rf "${cur_path:?}/output/${ASCEND_DEVICE_ID}"
+        mkdir -p "${cur_path}/output/$ASCEND_DEVICE_ID/ckpt"
+    else
+        mkdir -p "${cur_path}/output/$ASCEND_DEVICE_ID/ckpt"
+    fi
+
+
+
+    # Launch training; the arguments listed below are fixed, review the others per model.
+    #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path
+    corenum=$(grep -c 'processor' /proc/cpuinfo)
+    a=$(( RANK_ID * corenum / 8 ))
+    b=$(( RANK_ID + 1 ))
+    c=$(( b * corenum / 8 - 1 ))
+    if [[ -n "${bind_core}" ]];then
+        bind_core="taskset -c $a-$c"
+    fi
+    ${bind_core} python3 mask_rcnn_main.py --mode=train_and_eval \
+        --rank=$RANK_ID \
+        --Data_path=$data_path \
+        --train_batch_size=${batch_size} \
+        --training_file_pattern="${data_path}/train*" \
+        --validation_file_pattern="${data_path}/val*" \
+        --val_json_file=${data_path}/instances_val2017.json \
+        --eval_batch_size=2 \
+        --model_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt \
+        --over_dump=${over_dump} \
+        --over_dump_path=${over_dump_path} \
+        > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+        #--data_dump_flag=${data_dump_flag} \
+        #--data_dump_step=${data_dump_step} \
+        #--data_dump_path=${data_dump_path} \
+        #--profiling=${profiling} \
+        #--profiling_dump_path=${profiling_dump_path} \
+        #--autotune=${autotune} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+
+# Training end time; no need to modify.
+end_time=$(date +%s)
+e2e_time=$(( end_time - start_time ))
+
+# Print results; ASCEND_DEVICE_ID still holds the last rank launched by the loop above.
+echo "------------------ Final result ------------------"
+# Throughput: steps/sec times batch size times rank count (0 when the log has no match); review per model.
+FPSper=$(grep "] global_step/sec:" "$cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | awk 'END {print $6}')
+FPS=$(awk -v bs="${batch_size}" -v rs="${RANK_SIZE}" -v sps="${FPSper:-0}" 'BEGIN{printf "%f\n", bs*rs*sps}')
+# Print; no need to modify.
+echo "Final Performance images/sec : $FPS"
+
+# Training accuracy; review per model.
+train_accuracy=$(grep "Average Precision" "$cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | head -1 | awk '{print $13}')
+# Print; no need to modify.
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Stability/accuracy monitoring summary.
+# Test-case information; no need to modify.
+BatchSize=${batch_size}
+DeviceType=$(uname -m)
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'acc'
+
+## Performance data.
+# Throughput; no need to modify.
+ActualFPS=${FPS}
+# Time per training iteration (left empty when no step rate was found); no need to modify.
+TrainingTime=$(awk -v sps="${FPSper:-0}" 'BEGIN{if (sps+0 > 0) printf "%.2f\n", 1/sps}')
+
+# Extract loss values from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model.
+grep "] loss =" "$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log" | awk '{print $7}' | cut -d , -f 1 > "$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt"
+
+# Loss of the last iteration; no need to modify.
+ActualLoss=$(awk 'END {print}' "$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt")
+
+# Write the key results to ${CaseName}.log; no need to modify.
+echo "Network = ${Network}" > "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "RankSize = ${RANK_SIZE}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "BatchSize = ${BatchSize}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "DeviceType = ${DeviceType}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "CaseName = ${CaseName}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "ActualFPS = ${ActualFPS}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "TrainingTime = ${TrainingTime}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "TrainAccuracy = ${train_accuracy}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "ActualLoss = ${ActualLoss}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "E2ETrainingTime = ${e2e_time}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
diff --git a/TensorFlow/built-in/cv/detection/OpenPose_ID0117_for_TensorFlow/test/train_RT2_full_1p.sh b/TensorFlow/built-in/cv/detection/OpenPose_ID0117_for_TensorFlow/test/train_RT2_full_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ee6396984ffe4d09a793ac2ebf8bfc50c6211b87
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/OpenPose_ID0117_for_TensorFlow/test/train_RT2_full_1p.sh
@@ -0,0 +1,214 @@
+#!/bin/bash
+# Full OpenPose training with RANK_SIZE=1; --data_path is mandatory and ASCEND_DEVICE_ID is read from the environment.
+# Current directory; no need to modify.
+cur_path=$(pwd)
+
+# Collective-communication settings; no need to modify.
+
+export RANK_SIZE=1
+export JOB_ID=10087
+RANK_ID_START=0
+
+# Dataset path; keep empty here, it is supplied via --data_path.
+data_path=""
+
+# Default log level; no need to modify.
+export ASCEND_GLOBAL_LOG_LEVEL=3
+
+# Basic parameters; review per model.
+# Network name, same as the directory name.
+Network="OpenPose_ID0117_for_TensorFlow"
+# Training epochs.
+train_epoch=2000
+# Training batch size.
+batch_size=32
+# Learning rate.
+learning_rate=0.0001
+# Dynamic-input mode; no need to modify.
+dynamic_input=""
+
+
+# Debug parameters; review precision_mode per model.
+precision_mode="allow_fp32_to_fp16"
+# Maintenance parameters; nothing below needs modification.
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+autotune=False
+
+# Help text; no need to modify.
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_RT2_full_1p.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --autotune               whether to enable autotune, default is False
+    --data_path              source data of training
+    --train_epoch            # of epoch for training
+    --learning_rate          learning rate
+    --batch                  batch size
+    --modeldir               model dir
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+# Argument parsing: each "--name=value" sets the matching variable; some also create a dump directory.
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p "${over_dump_path}"
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p "${data_dump_path}"
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p "${profiling_dump_path}"
+    elif [[ $para == --autotune* ]];then
+        autotune=${para#*=} # NOTE(review): install_path is not set in this script; presumably inherited from the environment -- confirm
+        mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak
+        mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak
+        autotune_dump_path=${cur_path}/output/autotune_dump
+        mkdir -p ${autotune_dump_path}/GA
+        mkdir -p ${autotune_dump_path}/RL
+        cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/
+        cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --train_epoch* ]];then
+        train_epoch=${para#*=}
+    elif [[ $para == --learning_rate* ]];then
+        learning_rate=${para#*=}
+    elif [[ $para == --batch* ]];then
+        batch_size=${para#*=}
+    elif [[ $para == --modeldir* ]];then
+        modeldir=${para#*=}
+    elif [[ $para == --dynamic_input* ]];then
+        dynamic_input=${para#*=}
+    fi
+done
+
+# Verify that data_path was supplied; no need to modify.
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+
+############# Run training #########################
+# Training start time; no need to modify.
+start_time=$(date +%s)
+
+# Launch one background training process per rank; review per model.
+for ((RANK_ID = RANK_ID_START; RANK_ID < RANK_SIZE + RANK_ID_START; RANK_ID++));
+do
+    # Set environment variables; no need to modify.
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+
+    # (Re)create the per-device output directory; no need to modify.
+    if [ -d "${cur_path}/output/${ASCEND_DEVICE_ID}" ];then
+        rm -rf "${cur_path:?}/output/${ASCEND_DEVICE_ID}"
+        mkdir -p "${cur_path}/output/$ASCEND_DEVICE_ID/ckpt"
+    else
+        mkdir -p "${cur_path}/output/$ASCEND_DEVICE_ID/ckpt"
+    fi
+
+    # Launch training; the arguments listed below are fixed, review the others per model.
+    #--data_path, --model_dir, --precision_mode, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune
+    nohup python3 ${cur_path}/../Action/training/train_rt.py \
+        --train_epoch=${train_epoch} \
+        --data_path=${data_path} \
+        --learning_rate=${learning_rate} \
+        --modeldir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \
+        --precision_mode=${precision_mode} \
+        --over_dump=${over_dump} \
+        --over_dump_path=${over_dump_path} \
+        --data_dump_flag=${data_dump_flag} \
+        --data_dump_step=${data_dump_step} \
+        --data_dump_path=${data_dump_path} \
+        --batch=${batch_size} \
+        --profiling=${profiling} \
+        --profiling_dump_path=${profiling_dump_path} \
+        --autotune=${autotune} \
+        --dynamic_input=${dynamic_input} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+    if [ $? -ne 0 ];then # NOTE(review): $? after "&" is 0 once the job is started, so this never detects a training failure
+        exit 1
+    fi
+done
+wait
+
+# Training end time; no need to modify.
+end_time=$(date +%s)
+e2e_time=$(( end_time - start_time ))
+
+# Print results; no need to modify.
+echo "------------------ Final result ------------------"
+# Throughput: parse the timing tokens of the last "/step" log line; review per model.
+stepvalue=($(grep -r "/step" "$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log" | awk -F - 'END {print $2}' | awk -F / '{print $1}'))
+function strindex() {
+    local prefix="${1%%$2*}" # text of $1 before the first occurrence of $2
+    if [[ "$prefix" == "$1" ]];then
+        echo -1
+    else
+        echo ${#prefix}
+        return 0 # the offset is reported on stdout only; an exit status would wrap at 256
+    fi
+}
+index=$(strindex "${stepvalue[0]}" "s")
+second=${stepvalue[0]:0:$index}
+uindex=$(strindex "${stepvalue[1]}" "us")
+usecond=${stepvalue[1]:0:$uindex}
+step_sec=$(awk -v us="${usecond:-0}" 'BEGIN{printf "%.4f\n", us/1000}')
+FPS=$(awk -v bs="${batch_size}" -v ms="${step_sec}" 'BEGIN{if (ms+0 > 0) printf "%.2f\n", 1000*bs/ms}')
+# Print; no need to modify.
+echo "Final Performance images/sec : $FPS"
+
+# Training accuracy; review per model.
+train_accuracy=($(grep -r "/step" "$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log" | awk -F : 'END {print $3}' | awk '{print $1}'))
+# Print; no need to modify.
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring summary.
+# Test-case information; no need to modify.
+BatchSize=${batch_size}
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'acc'
+
+## Performance data; no need to modify.
+# Throughput.
+ActualFPS=${FPS}
+# Time per training iteration.
+TrainingTime=${step_sec}
+
+# Extract loss values from train_$ASCEND_DEVICE_ID.log into ${CaseName}_loss.txt; review per model.
+grep "/step" "$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log" | awk -F : '{print $2}' | awk '{print $1}' > "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt"
+# Loss of the last iteration; no need to modify.
+ActualLoss=($(awk 'END {print $NF}' "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt"))
+
+# Write the key results to ${CaseName}.log; no need to modify.
+echo "Network = ${Network}" > "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "RankSize = ${RANK_SIZE}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "BatchSize = ${BatchSize}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "DeviceType = ${DeviceType}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "CaseName = ${CaseName}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "ActualFPS = ${ActualFPS}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "TrainingTime = ${TrainingTime}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "TrainAccuracy = ${train_accuracy}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "ActualLoss = ${ActualLoss}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "E2ETrainingTime = ${e2e_time}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "DynamicInput = ${dynamic_input}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_full_1p.sh b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_full_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..be3a195c01dfe9d2486d5145bf6ff9a19992c229
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_full_1p.sh
@@ -0,0 +1,186 @@
+#!/bin/bash
+cur_path=$(pwd) # full SSD-Resnet50V1-FPN training with RANK_SIZE=1; all paths are relative to the launch directory
+export PYTHONPATH=$cur_path/../models/research:$cur_path/../models/research/slim:$PYTHONPATH
+
+# Collective communication.
+export RANK_SIZE=1
+export RANK_TABLE_FILE=$cur_path/../configs/${RANK_SIZE}p_${ASCEND_DEVICE_ID}.json
+export JOB_ID=10087
+RANK_ID_START=0
+
+# Dataset parameters.
+data_path="/data"
+
+
+# Training parameters; adjust per model. NOTE(review): the ID below (0116) differs from the directory name (ID1463) -- confirm.
+Network="SSD-Resnet50V1-FPN_ID0116_for_TensorFlow"
+num_train_steps=100000
+batch_size=32
+ckpt_path=/checkpoints
+pipeline_config=$cur_path/../models/research/configs/ssd320_full_1gpus.config
+
+# Debug parameters.
+overflow_dump=False
+overflow_dump_path=$cur_path/output/overflow_dump
+step_dump=False
+step_dump_path=$cur_path/output/step_dump
+check_loss_scale=False
+
+# Help text; adjust per network.
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage: ./train_RT2_full_1p.sh "
+
+    echo ""
+    echo "parameter explain:
+    --num_train_steps        training steps
+    --data_path              source data of training
+    --ckpt_path              pre-checkpoint path
+    --pipeline_config        pipeline config path
+    --overflow_dump          overflow detection,default is False
+    --overflow_dump_path     overflow dump path
+    --check_loss_scale       check whether loss scale is valid, default is False
+    --step_dump              Dump step data, default is False, can only set when overflow_dump is False
+    --step_dump_path         step_dump_path
+    -h/--help                Show help message
+    "
+    exit 1
+fi
+
+# Argument parsing; adjust per network.
+for para in "$@"
+do
+    if [[ $para == --num_train_steps* ]];then
+        num_train_steps=${para#*=}
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --ckpt_path* ]];then
+        ckpt_path=${para#*=}
+    elif [[ $para == --pipeline_config* ]];then
+        pipeline_config=${para#*=}
+    elif [[ $para == --overflow_dump* ]];then
+        overflow_dump=${para#*=}
+        if [ -d "${overflow_dump_path}" ];then
+            echo "overflow dump path: ${overflow_dump_path}"
+        else
+            mkdir -p "${overflow_dump_path}"
+        fi
+    elif [[ $para == --check_loss_scale* ]];then
+        check_loss_scale=${para#*=}
+    elif [[ $para == --step_dump* ]];then
+        step_dump=${para#*=}
+        if [ -d "${step_dump_path}" ];then
+            echo "step dump path: ${step_dump_path}"
+        else
+            mkdir -p "${step_dump_path}"
+        fi
+    fi
+done
+
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+########################## Run training #########################
+start_time=$(date +%s)
+cd "$cur_path/../models/research" || exit 1
+if [ -f "${pipeline_config}.bak" ];then
+    cp "${pipeline_config}.bak" "${pipeline_config}"
+else
+    cp "${pipeline_config}" "${pipeline_config}.bak"
+fi
+# Rewrite the checkpoint and dataset paths in place (no "p" flag: with -i it would duplicate each matched line).
+sed -i "s%/checkpoints%${ckpt_path}%" "${pipeline_config}"
+sed -i "s%/data/coco2017_tfrecords%${data_path}/coco2017_tfrecords%" "${pipeline_config}"
+
+for ((RANK_ID = RANK_ID_START; RANK_ID < RANK_SIZE + RANK_ID_START; RANK_ID++));
+do
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+    if [ -d "$cur_path/output/${ASCEND_DEVICE_ID}" ];then
+        rm -rf "${cur_path:?}/output/${ASCEND_DEVICE_ID}"
+        mkdir -p "$cur_path/output/${ASCEND_DEVICE_ID}"
+    else
+        mkdir -p "$cur_path/output/${ASCEND_DEVICE_ID}"
+    fi
+
+    # Training launch command; adjust per network. NOTE(review): "--alsologtostder" looks like a misspelling of --alsologtostderr -- confirm.
+    nohup python3 -u ./object_detection/model_main_rt.py \
+        --pipeline_config_path=${pipeline_config} \
+        --model_dir=$cur_path/output/${ASCEND_DEVICE_ID} \
+        --data_path=${data_path} \
+        --overflow_dump_path=${overflow_dump_path} \
+        --step_dump_path=${step_dump_path} \
+        --alsologtostder \
+        --amp \
+        --num_train_steps=${num_train_steps} \
+        "${@:1}" > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+done
+wait
+
+end_time=$(date +%s)
+e2e_time=$(( end_time - start_time ))
+
+########################## Service logs #########################
+grep ERROR $HOME/ascend/log/plog/*.log > "$cur_path/output/$ASCEND_DEVICE_ID/plog_err.log"
+
+################################ Performance results #########################
+echo "-----------------------Final result------------------------"
+# Throughput: last reported global_step/sec times the batch size (0.00 when the log has no match).
+FPS=$(grep -a 'INFO:tensorflow:global_step/sec: ' "$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log" | awk 'END {print $2}')
+
+FPS=$(awk -v bs="${batch_size}" -v sps="${FPS:-0}" 'BEGIN{printf "%.2f\n", bs*sps}')
+echo "Final Performance images/sec : $FPS"
+################################ Accuracy results #########################
+# Accuracy: 13th field of the first "Average Precision" line; adjust per network.
+train_accuracy=$(grep Precision "$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log" | grep Average | awk 'NR==1 {print $13}')
+
+#echo 'Final Training Accuracy mAP: $train_accuracy'
+################################ End-to-end training duration ##########################
+echo "Final Training Duration sec : $e2e_time"
+
+################################ Performance monitoring summary #############################
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'RT2'_'acc'
+ActualFPS=${FPS}
+TrainingTime=$(awk -v bs="${batch_size}" -v fps="${FPS:-0}" 'BEGIN{if (fps+0 > 0) printf "%.2f\n", bs*1000/fps}')
+
+# Append loss values from train_$ASCEND_DEVICE_ID.log to train_${CaseName}_loss.txt; adjust per model.
+grep INFO:tensorflow:loss "$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log" | awk '{print $3}' | sed 's/,//g' | sed '/^$/d' >> "$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt"
+
+ActualLoss=$(awk 'END {print}' "$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt")
+echo "Network = ${Network}" > "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "RankSize = ${RANK_SIZE}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "BatchSize = ${batch_size}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "DeviceType = ${DeviceType}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "CaseName = ${CaseName}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "ActualFPS = ${ActualFPS}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "TrainingTime = ${TrainingTime}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "ActualLoss = ${ActualLoss}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "E2ETrainingTime = ${e2e_time}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "TrainAccuracy = ${train_accuracy}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+
+# Eval support is still being developed; the final loss temporarily stands in for accuracy.
+echo "Final Training Accuracy loss: $ActualLoss"
+
+## Error information.
+# System error message.
+#error_msg="CanonicalizeShape failed, node:Postprocessor/BatchMultiClassNonMaxSuppression/MultiClassNonMaxSuppression/non_max_suppression/NonMaxSuppressionV3"
+error_msg="E19999: Inner Error"
+
+# Count the training-log lines that contain the known error message.
+Status=`grep "${error_msg}" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | wc -l`
+
+# Failure stage.
+ModelStatus="图执行FAIL"
+
+# DTS ticket number.
+#DTS_Number="DTS202105130LVO7FP0J00,DTS202105130O6E1SP1400"
+DTS_Number="DTS202105200RLRJ1P1300"
+echo "ModelStatus = ${ModelStatus}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "DTS_Number = ${DTS_Number}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "Status = ${Status}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo 
"error_msg = ${error_msg}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_full_1p_mobilenetv1_fpn.sh b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_full_1p_mobilenetv1_fpn.sh new file mode 100644 index 0000000000000000000000000000000000000000..3048f0d14acae6ee15bf12959f456a513da4914d --- /dev/null +++ b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_full_1p_mobilenetv1_fpn.sh @@ -0,0 +1,163 @@ +#!bin/bash +cur_path=`pwd` +#临时补丁,需要根据网络修改 +#cp $ASCEND_OPP_PATH/op_impl/built-in/ai_core/tbe/config/ascend910/aic-ascend910-ops-info.json $cur_path/aic-ascend910-ops-info.json.bak -f +#python3 ops_info_patch.py + +#环境设置,需要根据网络修改 +export PYTHONPATH=$cur_path/../models/research:$cur_path/../models/research/slim:$PYTHONPATH + +#集合通信 +export RANK_SIZE=1 +export RANK_TABLE_FILE=$cur_path/../configs/${RANK_SIZE}p_${ASCEND_DEVICE_ID}.json +export JOB_ID=10087 +RANK_ID_START=0 + +#数据集参数 +data_path="/data" + + +#训练参数,需要根据模型修改 +Network="SSD-MobilenetV1-FPN_ID1459_for_TensorFlow" +num_train_steps=100000 +batch_size=16 +ckpt_path=/checkpoints +pipeline_config=$cur_path/../models/research/configs/ssd_mobilenet_v1_fpn_shared_box_predictor_640x640_coco14_sync_1gpus.config + +#维测参数 +overflow_dump=False +overflow_dump_path=$cur_path/output/overflow_dump +step_dump=False +step_dump_path=$cur_path/output/step_dump +check_loss_scale=Flase + +#帮助提示,需要根据网络修改 +if [[ $1 == --help || $1 == -h ]];then + echo "usage: ./train_full_1p.sh " + + echo "" + echo "parameter explain: + --num_train_steps training steps + --data_path source data of training + --ckpt_path pre-checkpoint path + --pipeline_config pipeline config path + --overflow_dump overflow detection,default is False + --overflow_dump_path overflow dump path + --check_loss_scale check whether loss scale is valid, default is False + --step_dump 
Dump step data, default is False, can only set when overflow_dump is False + --step_dump_path step_dump_path + -h/--help Show help message + " + exit 1 +fi + +#入参设置,需要根据网络修改 +for para in $* +do + if [[ $para == --num_train_steps* ]];then + num_train_steps=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --pipeline_config* ]];then + pipeline_config=`echo ${para#*=}` + elif [[ $para == --overflow_dump* ]];then + overflow_dump=`echo ${para#*=}` + if [ -d ${overflow_dump_path} ];then + echo "overflow dump path: ${overflow_dump_path}" + else + mkdir -p ${overflow_dump_path} + fi + elif [[ $para == --check_loss_scale* ]];then + check_loss_scale=`echo ${para#*=}` + elif [[ $para == --step_dump* ]];then + step_dump=`echo ${para#*=}` + if [ -d ${step_dump_path} ];then + echo "step dump path: ${step_dump_path}" + else + mkdir -p ${step_dump_path} + fi + fi +done + +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi + +##########################执行训练######################### +start_time=$(date +%s) +cd $cur_path/../models/research +if [ -f ${pipeline_config}.bak ];then + cp ${pipeline_config}.bak ${pipeline_config} +else + cp ${pipeline_config} ${pipeline_config}.bak +fi + +#sed -i "s%/checkpoints%${ckpt_path}%p" ${pipeline_config} +sed -i "s%/data/coco2017_tfrecords%${data_path}/coco2017_tfrecords%p" ${pipeline_config} + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); + do + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then + rm -rf $cur_path/output/${ASCEND_DEVICE_ID} + mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} + else + mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} + fi + +#训练执行脚本,需要根据网络修改 + nohup python3 -u ./object_detection/model_main_rt.py \ + --pipeline_config_path=${pipeline_config} \ + 
--model_dir=$cur_path/output/${ASCEND_DEVICE_ID}/npu_ckpt_mobilenetv1_fpn_${RANK_SIZE}p \ + --data_path=${data_path} \ + --overflow_dump_path=${overflow_dump_path} \ + --step_dump_path=${step_dump_path} \ + --alsologtostder \ + --amp \ + --num_train_steps=${num_train_steps} \ + "${@:1}" > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +done +wait + +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +################################性能结果处理######################### +echo "-----------------------Final result------------------------" +#性能FPS计算,需要根据网络修改 +FPS=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $2}'|tail -2|head -1` + +FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${FPS}'*'${RANK_SIZE}'}'` +echo "Final Performance images/sec : $FPS" +################################精度结果处理######################### +#精度计算,需要根据网络修改 +train_accuracy=`grep Precision $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep Average |awk 'NR==1 {print $13}'` + +#echo 'Final Training Accuracy mAP: $train_accuracy' +################################E2E训练时长########################## +echo "Final Training Duration sec : $e2e_time" + +################################性能看护############################# +DeviceType=`uname -m` +CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'RT2'_'acc' +ActualFPS=${FPS} +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型修改 +grep INFO:tensorflow:loss $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $3}'|sed 's/,//g'|sed '/^$/d'|head -n 1001 >> $cur_path/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt + +ActualLoss=`awk 'END {print}' $cur_path/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` +echo "Network = ${Network}" > $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "RankSize = 
${RANK_SIZE}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_full_8p_mobilenetv2.sh b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_full_8p_mobilenetv2.sh new file mode 100644 index 0000000000000000000000000000000000000000..8378680f5bdeecbf8b5506a195cac5b3509ca816 --- /dev/null +++ b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_full_8p_mobilenetv2.sh @@ -0,0 +1,145 @@ +#!bin/bash +cur_path=`pwd` + +#环境设置,需要根据网络修改 +export PYTHONPATH=$cur_path/../models/research:$cur_path/../models/research/slim:$PYTHONPATH +export HCCL_CONNECT_TIMEOUT=200 +#集合通信 +export RANK_SIZE=8 +export RANK_TABLE_FILE=$cur_path/../configs/${RANK_SIZE}p.json +export JOB_ID=10087 +RANK_ID_START=0 +ASCEND_DEVICE_ID_START=0 + +#数据集参数 +data_path="/data" +use_conda=0 + +#训练参数,需要根据模型修改 +Network="SSD-MobilenetV2_ID0499_for_TensorFlow" +num_train_steps=50000 +batch_size=24 +ckpt_path=/checkpoints +pipeline_config=$cur_path/../models/research/configs/ssd_mobilenet_v2_coco_8p.config + +#帮助提示,需要根据网络修改 +if [[ $1 == --help || $1 == -h ]];then + echo "usage: ./train_performance_8p.sh " + echo "" 
+ echo "parameter explain: + --num_train_steps training steps + --data_path source data of training + --ckpt_path pre-checkpoint path + --pipeline_config pipeline config path + --skip_eval whether to skip eval + -h/--help Show help message + " + exit 1 +fi + +#入参设置,需要根据网络修改 +for para in $* +do + if [[ $para == --num_train_steps* ]];then + num_train_steps=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --pipeline_config* ]];then + pipeline_config=`echo ${para#*=}` + elif [[ $para == --use_conda* ]];then + use_conda=`echo ${para#*=}` + elif [[ $para == --skip_eval* ]];then + skip_eval=`echo ${para#*=}` + fi +done + +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi + +##########################执行训练######################### +start_time=$(date +%s) +cd $cur_path/../models/research +if [ -f ${pipeline_config}.bak ];then + cp ${pipeline_config}.bak ${pipeline_config} +else + cp ${pipeline_config} ${pipeline_config}.bak +fi + +# 更改参数 +sed -i "s%/data/coco2017_tfrecords%${data_path}/coco2017_tfrecords%p" ${pipeline_config} + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); + do + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$((ASCEND_DEVICE_ID_START+RANK_ID)) + echo "Device ID: $ASCEND_DEVICE_ID" + if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then + rm -rf $cur_path/output/${ASCEND_DEVICE_ID} + mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} + else + mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} + fi + + #训练执行脚本,需要根据网络修改 + nohup python3 -u ./object_detection/model_main_rt.py \ + --pipeline_config_path=${pipeline_config} \ + --model_dir=$cur_path/output/${ASCEND_DEVICE_ID}/npu_ckpt_mobilenetv2_${RANK_SIZE}p\ + --data_path=${data_path} \ + --alsologtostder \ + --amp \ + --num_train_steps=${num_train_steps} \ + "${@:1}" > 
$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +done +wait + +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) +echo "Final Training Duration sec : $e2e_time" +ASCEND_DEVICE_ID=0 + +# 参数回改 +sed -i "s%${data_path}/coco2017_tfrecords%/data/coco2017_tfrecords%p" ${pipeline_config} + + +################################性能结果处理######################### +echo "-----------------------Final result------------------------" +# 性能FPS计算,需要根据网络修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $2}'|tail -2|head -n 1` +FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${step_sec}'}'` +echo "Final Performance images/sec : ${FPS}" + +#################################精度结果处理######################### +# 精度计算,需要根据网络修改 +train_accuracy=`grep Precision $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'NR==1 {print $13}'` +echo "Final Training Accuracy mAP: ${train_accuracy}" + +#################################性能看护############################# +# 训练用例信息,不需要修改 +DeviceType=`uname -m` +BatchSize=${batch_size} +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'acc' +ActualFPS=${FPS} +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*1000/'${FPS}'}'` + +#################################Loss######################### +ASCEND_DEVICE_ID=7 +# 从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型修改 +grep INFO:tensorflow:loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $3}'|sed 's/,//g'|sed '/^$/d' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +echo "Final Training Accuracy loss: ${ActualLoss}" + +# 关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log 
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/cv/image_classification/3D_UNet-Medical_ID1462_for_TensorFlow/test/train_RT2_full_1p.sh b/TensorFlow/built-in/cv/image_classification/3D_UNet-Medical_ID1462_for_TensorFlow/test/train_RT2_full_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..9e932a7acebcdc0f6b0eeb4f6b5c57a3a61664d8 --- /dev/null +++ b/TensorFlow/built-in/cv/image_classification/3D_UNet-Medical_ID1462_for_TensorFlow/test/train_RT2_full_1p.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 + +export RANK_SIZE=1 +export JOB_ID=10087 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="3D_UNet-Medical_ID1462_for_TensorFlow" +#Batch Size +batch_size=2 +#训练epoch,可选 +#train_epochs= +#训练step +train_steps=7000 +#学习率 +learning_rate=1e-3 + + +#TF2.X独有,不需要修改 +#export NPU_LOOP_SIZE=${train_steps} + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_fp32_to_fp16" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False +#其他参数配置 +model_dir="models/base_model" +mode="train_eval" + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision 
mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --loss_scale_flag* ]];then + loss_scale_flag=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path \" must be config" + exit 1 +fi + +#3D Unet独有 +#sed -i "s|data/processed/train_dataset_resized.pckl|${data_path}/data/processed/train_dataset_resized.pckl|g" $cur_path/../$model_dir/params.json +#sed -i "s|data/processed/test_dataset.pckl|${data_path}/data/processed/test_dataset_resized.pckl|g" $cur_path/../$model_dir/params.json +#sed -i "s|\"max_train_steps\": 7000|\"max_train_steps\": ${train_steps}|g" $cur_path/../$model_dir/params.json +wait + +#训练开始时间,不需要修改 +start_time=$(date +%s) +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/.. 
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID + fi + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune + nohup python3 src/main.py \ + -step=${train_steps} \ + -train_path=${data_path}/data/processed/train_dataset_resized.pckl \ + -test_path=${data_path}/data/processed/test_dataset_resized.pckl \ + -precision_mode=$precision_mode \ + -model_dir=${model_dir} \ + -mode=${mode}> $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +done +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +steps_sec=`grep "global_step/sec" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $2}'| awk '{line[NR]=$0} END {for(i=3;i<=NR;i++) print line[i]}'|awk '{sum+=$1} END {print sum/NR}'` +#打印,不需要修改 +FPS=`echo "${steps_sec} ${batch_size}" | awk '{printf("%.4f\n",$1*$2)}'` +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep "iou =" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $12}'|cut -d , -f 1` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" + + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + 
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ":loss = " $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $3}'|cut -d , -f 1 >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/cv/image_classification/Face-ResNet50_ID1372_for_TensorFlow/test/train_RT2_full_1p.sh b/TensorFlow/built-in/cv/image_classification/Face-ResNet50_ID1372_for_TensorFlow/test/train_RT2_full_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..abc1278a6e41364c534e33bc80d3eaa11fe5b504 --- /dev/null +++ b/TensorFlow/built-in/cv/image_classification/Face-ResNet50_ID1372_for_TensorFlow/test/train_RT2_full_1p.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +cur_path=`pwd`/../ +#失败用例打屏 +#export ASCEND_SLOG_PRINT_TO_STDOUT=1 + +#基础参数,需要模型审视修改 +#Batch Size +batch_size=32 +#网络名称,同目录名称 +Network="Face-ResNet50_ID1372_for_TensorFlow" +#Device数量,单卡默认为1 +RANK_SIZE=1 +#训练epoch,可选 
+train_epochs=1 +#训练step +train_steps= +#学习率 +learning_rate= + +#参数配置 +data_path="" +#work_dir="$cur_path/estimator_working_dir" +#export_path="$cur_path/outputs/models/000001-first_generation" + +if [[ $1 == --help || $1 == --h ]];then + echo "usage:./train_full_1p.sh" + exit 1 +fi + +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +##############执行训练########## +cd $cur_path +if [ -d $cur_path/test/output ];then + rm -rf $cur_path/test/output/* + mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID +else + mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID +fi +wait + +#sed -i "s|./CACD2000_Crop/|${data_path}/|g" TrainResNet.py +#sed -i "s|./label|${data_path}/label|g" TrainResNet.py + +start=$(date +%s) +nohup python3 TrainResNet_rt.py \ + --label_path ${data_path}/label/label_1200.npy \ + --image_name_path ${data_path}/label/name_1200.npy \ + --train_data_path ${data_path}/train_data/1200_data.npy \ + --parentPath ${data_path}/CACD2000_Crop/ \ + --epochs 100 > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +wait +end=$(date +%s) +e2e_time=$(( $end - $start )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" + + +#输出性能FPS,需要模型审视修改 +steps_per_s=`grep steps_per_s ${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk 'END{print $2}'` +FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${steps_per_s}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + + +#输出训练精度,需要模型审视修改 +train_accuracy="None" +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'acc' + + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} + + +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf 
"%.2f\n",'${BatchSize}'/'${FPS}'}'` + + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Cost $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $7}' > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + + +#最后一个迭代loss值(Read-Only) +ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中(Read-Only) +echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log + diff --git a/TensorFlow/built-in/nlp/CNN-CTC_ID0683_for_TensorFlow/test/train_RT2_full_1p.sh b/TensorFlow/built-in/nlp/CNN-CTC_ID0683_for_TensorFlow/test/train_RT2_full_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..be97620cf9af511c99cac994447c7f90c066875c --- /dev/null +++ b/TensorFlow/built-in/nlp/CNN-CTC_ID0683_for_TensorFlow/test/train_RT2_full_1p.sh @@ -0,0 +1,102 @@ +#!/bin/bash +cur_path=`pwd`/../ + +#基础参数,需要模型审视修改 +#Batch Size +batch_size=128 +#网络名称,同目录名称 +Network="CNN-CTC_ID0683_for_TensorFlow" +#Device数量,单卡默认为1 +RankSize=1 +#训练epoch,可选 +train_epochs=10 +#学习率 
+learning_rate=0.0001 + +if [[ $1 == --help || $1 == --h ]];then + echo "usage:./train_full_1p.sh --data_path=./imgs" + exit 1 +fi + +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path \" must be config" + exit 1 +fi +##############执行训练########## +wait +cd $cur_path +if [ -d $cur_path/test/output ];then + rm -rf $cur_path/test/output/* + mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID +else + mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID +fi +wait + +start=$(date +%s) +nohup python3 main_rt.py --train_dir=${data_path}/train/ \ + --val_dir=${data_path}/val \ + --image_height=60 \ + --image_width=180 \ + --image_channel=1 \ + --out_channels=64 \ + --num_hidden=128 \ + --batch_size=$batch_size \ + --logs_dir=./log \ + --num_gpus=1 \ + --initial_learning_rate=$learning_rate \ + --num_epochs=$train_epochs \ + --mode=train > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +wait +end=$(date +%s) +e2e_time=$(( $end - $start )) + +#echo "Final Performance ms/step : $average_perf" +echo "Final Training Duration sec : $e2e_time" + +TrainingTime=`grep "batch " $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log |awk 'END {print $5}'` +FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${TrainingTime}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep "accuracy" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $7}'|cut -d , -f 1` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +#TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep 
"lastbatch_err" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $12}' | cut -d , -f 1 > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print $1}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/nlp/DS-CNN_RT2_ID1769_for_TensorFlow/test/train_RT2_full_1p.sh b/TensorFlow/built-in/nlp/DS-CNN_RT2_ID1769_for_TensorFlow/test/train_RT2_full_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..cfb678405c284d83792b2545fdb4fa5c34953bbf --- /dev/null +++ b/TensorFlow/built-in/nlp/DS-CNN_RT2_ID1769_for_TensorFlow/test/train_RT2_full_1p.sh @@ -0,0 +1,208 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` +#集合通信参数,不需要修改 + +export RANK_SIZE=1 +export JOB_ID=10087 + +RANK_ID_START=0 + + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#设置默认日志级别,不需要修改 +#export ASCEND_GLOBAL_LOG_LEVEL=1 + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +#ID1900_CarPeting_tf115_mobilebert 
+Network="DS-CNN_ID1769_for_TensorFlow" +batch_size=100 +train_steps=30000 + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False +autotune=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + + +#data_path='../' +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) +cd $cur_path/.. 
+mkdir -p logs/checkpoints/
+
+#sed -i "s|tmp|${data_path}/data|g" run_squad.py
+#sed -i "s|./speech_dataset/|${data_path}/|g" train.py
+
+
+# Launch one background training process per rank (RANK_SIZE is exported as 1 above); review and adjust per model
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    # Set environment variables; no modification needed
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+
+    # Recreate the per-device output directory (paths quoted so spaces cannot split the rm -rf target)
+    if [ -d "${cur_path}/output/${ASCEND_DEVICE_ID}" ];then
+        rm -rf "${cur_path}/output/${ASCEND_DEVICE_ID}"
+        mkdir -p "${cur_path}/output/$ASCEND_DEVICE_ID/ckpt"
+    else
+        mkdir -p "${cur_path}/output/$ASCEND_DEVICE_ID/ckpt"
+    fi
+
+# # Disabled invocation of a different script (train_adda_seg.py), kept commented out as in the original
+# #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune
+# nohup python3.7 $cur_path/../train_adda_seg.py ${data_path}/data/inria_test source_image source_label_index target_image adda_deeplab_v3p.h5 \
+# --optimizer adam \
+# --base_learning_rate 1e-4 \
+# --min_learning_rate 1e-7 \
+# --image_width 256 \
+# --image_height 256 \
+# --image_channel 3 \
+# --image_suffix .png \
+# --label_suffix .png \
+# --n_class 2 \
+# --batch_size 2 \
+# --iterations 50 \
+# --weight_decay 1e-4 \
+# --initializer he_normal \
+# --bn_epsilon 1e-3 \
+# --bn_momentum 0.99 \
+# --pre_trained_model ./logs/checkpoints/deeplab_v3p_base.h5 \
+# --source_fname_file ${data_path}/data/inria_test/source.txt \
+# --target_fname_file ${data_path}/data/inria_test/target.txt \
+# --logs_dir ./logs \
+# --augmentations flip_x,flip_y,random_crop \
+# --display 1 \
+# --snapshot 5 > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+    echo "data_path is : "
+    echo "data_path is : $data_path"
+    # Run train.py in the background; stdout and stderr go to the per-device train log
+    #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune
+    python3 train.py \
+        --model_architecture ds_cnn \
+        --model_size_info 5 64 10 4 2 2 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 64 3 3 1 1 \
+        --dct_coefficient_count 10 \
+        --window_size_ms 40 \
+        --window_stride_ms 20 \
+        --learning_rate 0.0005,0.0001,0.00002 \
+        --how_many_training_steps 10000,10000,10000 \
+        --summaries_dir ./result/work/DS_CNN/DS_CNN_1/retrain_logs \
+        --data_dir "${data_path}" \
+        --batch_size ${batch_size} \
+        --train_dir ./result/work/DS_CNN/DS_CNN_1/training > "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" 2>&1 &
+done
+wait
+
+#sed -i "s|${data_path}/|./speech_dataset/|g" train.py
+#sed -i "s|${data_path}/tmp|tmp|g" train.py
+cd "$cur_path"  # return to the launch directory by absolute path (was the relative "cd test")
+
+# Training end time and end-to-end duration in seconds
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results
+echo "------------------ Final result ------------------"
+# Performance (FPS); review per model. The commented lines below are an older timestamp-based calculation.
+#h_step_2=`grep ' Step #2:' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'|awk -F . '{print $1}'|awk -F : '{print $1}'`
+#m_step_2=`grep ' Step #2:' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'|awk -F . '{print $1}'|awk -F : '{print $2}'`
+#s_step_2=`grep ' Step #2:' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'|awk -F . '{print $1}'|awk -F : '{print $3}'`
+#h_step_100=`grep ' Step #100:' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'|awk -F . '{print $1}'|awk -F : '{print $1}'`
+#m_step_100=`grep ' Step #100:' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'|awk -F . '{print $1}'|awk -F : '{print $2}'`
+#s_step_100=`grep ' Step #100:' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'|awk -F . '{print $1}'|awk -F : '{print $3}'`
+#step_sec=`awk 'BEGIN{printf "%.2f\n",('${h_step_100}'-'${h_step_2}')*3600+('${m_step_100}'-'${m_step_2}')*60+('${s_step_100}'-'${s_step_2}')}'`
+
+data_step_sec=$(grep 'train duration:' "$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log"|awk 'END {print $5}')  # field 5 of the last 'train duration:' line
+train_step_sec=$(grep 'train duration:' "$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log"|awk 'END {print $8}')  # field 8 of the same line
+step_sec=$([[ -n $data_step_sec && -n $train_step_sec ]] && awk 'BEGIN{printf "%.6f\n",'${data_step_sec}'+'${train_step_sec}'}')  # left empty when the log has no such line, instead of an awk syntax error
+step_per_s=$(awk -v s="$step_sec" 'BEGIN{ if (s != "" && s + 0 != 0) printf "%.4f\n", 1/s }')  # guarded against empty/zero divisor
+FPS=$(awk -v bs="$batch_size" -v sps="$step_per_s" 'BEGIN{ if (sps != "") printf "%.2f\n", bs*sps }')
+# Print
+echo "Final Performance images/sec : $FPS"
+
+# Training accuracy: last 'Validation accuracy' value with '%' stripped, then divided by 100; stays empty when no such line was logged
+train_accuracy=$(grep 'INFO:tensorflow.*Validation accuracy' "$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log"|awk 'END {print $6}'|awk -F % 'END {print $1}')
+train_accuracy=$([[ -n $train_accuracy ]] && awk 'BEGIN{printf "%.4f\n",'${train_accuracy}'/100}')
+# Print
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of results for stability/accuracy monitoring
+# Training case information; no modification needed
+BatchSize=${batch_size}
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'acc'
+
+## Collect performance data
+# Throughput; no modification needed
+ActualFPS=${FPS}
+# Training time per iteration (BatchSize / FPS); empty when FPS is empty or zero
+TrainingTime=$(awk -v bs="$BatchSize" -v fps="$FPS" 'BEGIN{ if (fps != "" && fps + 0 != 0) printf "%.2f\n", bs/fps }')
+
+# Extract loss values (field 9 of 'cross entropy' lines) from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep 'INFO:tensorflow.*cross entropy' "$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log"|awk '{print $9}' > "$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt"
+
+# Loss of the last iteration; no modification needed
+ActualLoss=$(awk 'END {print}' "$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt")
+
+# Write key information to ${CaseName}.log; the first echo truncates the file, the rest append
+echo "Network = ${Network}" > "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "RankSize = ${RANK_SIZE}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "BatchSize = ${BatchSize}" >> "$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log"
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/nlp/Siamese_ID0506_for_TensorFlow/test/train_RT2_full_1p.sh b/TensorFlow/built-in/nlp/Siamese_ID0506_for_TensorFlow/test/train_RT2_full_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..ba2bff044f00490355235732e470af56ad5e5ded --- /dev/null +++ b/TensorFlow/built-in/nlp/Siamese_ID0506_for_TensorFlow/test/train_RT2_full_1p.sh @@ -0,0 +1,112 @@ +#!/bin/bash +cur_path=`pwd`/../ + + +#设置默认日志级别,不需要修改 +export ASCEND_GLOBAL_LOG_LEVEL=3 + +#基础参数,需要模型审视修改 +#Batch Size +batch_size=64 +#网络名称,同目录名称 +Network="Siamese_ID0506_for_TensorFlow" +#Device数量,单卡默认为1 +RANK_SIZE=1 +#训练epoch,可选 +train_epochs=100 +#训练step +train_steps= +#学习率 +learning_rate=5e-5 + +#参数配置 +data_path="" + +if [[ $1 == --help || $1 == --h ]];then + echo "usage:./train_full_1p.sh" + exit 1 +fi + +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path \" must be config" + exit 1 +fi +##############执行训练########## +cd $cur_path + +wait + +if [ -d $cur_path/test/output ];then + rm -rf $cur_path/test/output/* + mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID +else + mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID +fi +wait + +start=$(date +%s) +nohup python3 train_rt.py \ + --num_epochs $train_epochs \ + 
--training_files=$data_path/person_match.train2 \ + --hidden_units=64 \ + --embedding_dim=304 \ + --checkpoint_every 100 > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +wait + +end=$(date +%s) +e2e_time=$(( $end - $start )) + +#echo "Final Performance ms/step : $average_perf" +echo "Final Training Duration sec : $e2e_time" + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +TrainingTime=`grep "TRAIN " $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $6}'` +wait +FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${TrainingTime}'*1000}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep "TRAIN " $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $10}'` +#打印,不需要修改 +#echo "Final Train Accuracy : ${train_accuracy}" + + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "TRAIN " $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}' > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print $1}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> 
$cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/nlp/Textcnn_ID0123_For_Tensorflow/test/train_RT2_full_1p.sh b/TensorFlow/built-in/nlp/Textcnn_ID0123_For_Tensorflow/test/train_RT2_full_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..8af5aa9120786387cc3ed72e539e8c0c56e2db32 --- /dev/null +++ b/TensorFlow/built-in/nlp/Textcnn_ID0123_For_Tensorflow/test/train_RT2_full_1p.sh @@ -0,0 +1,193 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 + +export RANK_SIZE=1 +export JOB_ID=10087 +RANK_ID_START=0 + + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="Textcnn_ID0123_For_Tensorflow" +#训练epoch +train_epochs=10 +#训练batch_size +batch_size=512 +#学习率 +learning_rate=0.001 +#训练模式 +mode="train_and_eval" +npu_loss_scale=1 + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_fp32_to_fp16" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False +autotune=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default 
is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --train_epochs* ]];then + train_epochs=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --mode* ]];then + mode=`echo ${para#*=}` + elif [[ $para == --npu_loss_scale* ]];then + npu_loss_scale=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/.. 
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    # Set environment variables; no modification needed
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+
+
+
+    # Recreate the per-device output directory (paths quoted so spaces cannot split the rm -rf target)
+    if [ -d "${cur_path}/output/${ASCEND_DEVICE_ID}" ];then
+        rm -rf "${cur_path}/output/${ASCEND_DEVICE_ID}"
+        mkdir -p "${cur_path}/output/$ASCEND_DEVICE_ID/ckpt"
+    else
+        mkdir -p "${cur_path}/output/$ASCEND_DEVICE_ID/ckpt"
+    fi
+
+    # Run run_cnn_rt.py in the background; stdout and stderr go to the per-device train log
+    #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune
+    nohup python3 run_cnn_rt.py \
+        --mode="${mode}" \
+        --data_path="${data_path}" \
+        --num_epochs="${train_epochs}" \
+        --save_dir="${cur_path}/output/$ASCEND_DEVICE_ID/ckpt" \
+        --learning_rate="${learning_rate}" \
+        --precision_mode="${precision_mode}" \
+        --over_dump="${over_dump}" \
+        --over_dump_path="${over_dump_path}" \
+        --data_dump_flag="${data_dump_flag}" \
+        --data_dump_step="${data_dump_step}" \
+        --data_dump_path="${data_dump_path}" \
+        --batch_size="${batch_size}" \
+        --profiling="${profiling}" \
+        --profiling_dump_path="${profiling_dump_path}" \
+        --npu_loss_scale="${npu_loss_scale}" \
+        --autotune="${autotune}" > "${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" 2>&1 &
+done
+wait
+
+# Training end time and end-to-end duration in seconds
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results
+echo "------------------ Final result ------------------"
+# Performance (FPS); review per model. "time" holds the text between the last '(' and the next ')' of every "Time: " log line.
+time=($(grep -r "Time: " "$cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" | awk -F '(' '{print $NF}' | cut -d ')' -f 1))
+i=${#time[*]}
+train_time=$([[ $i -gt 2 ]] && echo "${time[i-1]} ${time[1]} $i"|awk '{print ($1-$2)*10/($3-2)}')  # (last - second sample) * 10 / (count - 2); needs at least 3 samples, otherwise left empty
+FPS=$(echo "$batch_size $train_time"|awk '$2 + 0 != 0 {print $1*1000/$2}')  # skipped when train_time is empty or zero (was a fatal awk division by zero)
+# Print
+echo "Final Performance images/sec : $FPS"
+
+# Training accuracy: last field of the last "Test Acc:" line with '%' removed (END added so repeated lines cannot yield a multi-line value)
+train_accuracy=$(grep "Test Acc:" "$cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log"|awk 'END {print $NF}'|sed 's/%//g')
+#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'acc' + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=$train_time + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "Train Loss:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $5}'|cut -d ',' -f 1 >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log