diff --git a/TensorFlow/built-in/cv/detection/FasterRcnn_resnet50_ID0010_for_TensorFlow/test/train_RT2__performance_8p.sh b/TensorFlow/built-in/cv/detection/FasterRcnn_resnet50_ID0010_for_TensorFlow/test/train_RT2__performance_8p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..85d4d145f7d5b48b9225672fa3af0a4171f62996
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/FasterRcnn_resnet50_ID0010_for_TensorFlow/test/train_RT2__performance_8p.sh
@@ -0,0 +1,201 @@
+#!/bin/bash
+
+export JOB_ID=10000
+export ENABLE_RUNTIME_V2=1
+
+exec_mode='train' # or 'train_and_eval'
+eval_after_training=False
+
+backbone='resnet50'
+backbone_ckpt_path='/npu/traindata/resnet50_ckpt'
+data_path='/npu/traindata/coco_official_2017'
+
+batch_size=2
+steps=1000
+
+learning_rate_type='cosine' # or 'step'
+learning_rate=0.02
+warmup_learning_rate=0.0067
+warmup_steps=500
+learning_rate_levels='[0.002, 0.0002]'
+learning_rate_steps='[60000, 80000]'
+
+precision_mode='allow_mix_precision'
+loss_scale_flag=0
+loss_scale_value=256
+overflow_dump=False
+
+########## params from command line ##########
+
+for arg in $* ; do
+    if [ ${arg:0:2} == '--' ]; then
+        arg=${arg:2}
+        pos=`expr index "$arg" =`
+        if [ $pos -gt 0 ]; then
+            var_name=${arg:0:$pos-1}
+            var_value=${arg:$pos}
+            eval $var_name=$var_value
+        fi
+    fi
+done
+
+for para in $*
+do
+    if [[ $para == --bind_core* ]];then
+        bind_core=`echo ${para#*=}`
+        name_bind="_bindcore"
+    fi
+done
+
+if [ ! $output_dir ]; then
+    output_dir="`pwd`/output/"
+fi
+echo output_dir=$output_dir
+
+training_file_pattern=${training_file_pattern:-$data_path'/tfrecord/train*'}
+validation_file_pattern=${validation_file_pattern:-$data_path'/tfrecord/val*'}
+val_json_file=${val_json_file:-$data_path'/annotations/instances_val2017.json'}
+
+########## build params_override ##########
+
+unset params_override
+params_override=${params_override}backbone=$backbone,
+params_override=${params_override}checkpoint="'$backbone_ckpt_path'",
+params_override=${params_override}training_file_pattern="'$training_file_pattern'",
+params_override=${params_override}validation_file_pattern="'$validation_file_pattern'",
+params_override=${params_override}val_json_file="'$val_json_file'",
+params_override=${params_override}train_batch_size=$batch_size,
+params_override=${params_override}total_steps=$steps,
+params_override=${params_override}learning_rate_type=$learning_rate_type,
+params_override=${params_override}init_learning_rate=$learning_rate,
+params_override=${params_override}warmup_learning_rate=$warmup_learning_rate,
+params_override=${params_override}warmup_steps=$warmup_steps,
+params_override=${params_override}learning_rate_levels="'$learning_rate_levels'",
+params_override=${params_override}learning_rate_steps="'$learning_rate_steps'",
+params_override=${params_override}npu_precision_mode=$precision_mode,
+params_override=${params_override}npu_loss_scale_flag=$loss_scale_flag,
+params_override=${params_override}npu_loss_scale=$loss_scale_value,
+params_override=${params_override}npu_overflow_dump=$overflow_dump,
+
+echo [params_override] "$params_override"
+
+########## prepare environment ##########
+
+export RANK_SIZE=8
+export RANK_ID_START=0
+
+BASE_PATH=`cd $(dirname $0); pwd`/../FasterRcnn
+echo "BASE_PATH="$BASE_PATH
+
+export RANK_TABLE_FILE=$BASE_PATH/npu_config/8p.json
+
+rm -rf /root/ascend/log
+
+########## run ##########
+
+start_time=$(date +%s)
+
+pids=
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    echo
+    # msnpureport ships with the Ascend driver; raise the device-side log level to 'error' before launch
+    /usr/local/Ascend/driver/tools/msnpureport -d $RANK_ID -g error
+
+    TMP_PATH=$output_dir/$RANK_ID
+    mkdir -p $TMP_PATH
+    cd $TMP_PATH
+
+    rm -f configs
+    ln -s $BASE_PATH/configs configs
+
+    export RANK_ID
+    export DEVICE_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    export DEVICE_INDEX=$RANK_ID
+
+    # bind each rank to its own 1/8 slice of the host cores
+    corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l`
+    let a=RANK_ID*${corenum}/8
+    let b=RANK_ID+1
+    let c=b*${corenum}/8-1
+    bind_core="taskset -c $a-$c"
+
+    ${bind_core} python3 $BASE_PATH/mask_rcnn_main.py --mode=$exec_mode \
+        --eval_after_training=$eval_after_training \
+        --model_dir=$TMP_PATH/result \
+        --num_gpus=$RANK_SIZE \
+        --params_override="$params_override" \
+        "$@" 2>&1 | tee $TMP_PATH/train_${RANK_ID}.log &
+
+    pids[$RANK_ID-$RANK_ID_START]="$RANK_ID $!"
+    cd -
+done
+
+sleep 1
+echo "########## Waiting for pids: "${pids[*]}
+
+for pid in "${pids[@]}"; do
+    pid=($pid)
+    RANK_ID=${pid[0]}
+    pid=${pid[1]}
+
+    wait $pid
+    ret=$?
+    echo "******************** train finished ******************** $RANK_ID - $pid - ret : $ret"
+
+    ############################## E2E training duration ##############################
+    end_time=$(date +%s)
+    e2e_time=$(( $end_time - $start_time ))
+    echo "Final Training Duration sec : $e2e_time"
+
+    ############################## service logs ##############################
+    grep ERROR /root/ascend/log/plog/plog-${pid}_*.log > $output_dir/$RANK_ID/plog_err.log
+
+    log_file=$output_dir/$RANK_ID/train_${RANK_ID}.log
+
+    ############################## performance results ##############################
+    echo "-------------------- Final result --------------------"
+    # FPS calculation; modify according to the network
+    FPS=`grep -a 'INFO:tensorflow:global_step/sec: ' $log_file|awk 'END {print $2}'`
+    FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${FPS}'*8}'`
+    echo "Final Performance images/sec : $FPS"
+
+    ############################## accuracy results ##############################
+    # accuracy calculation; modify according to the network
+    train_accuracy=`grep "Average Precision" $log_file | awk 'NR==1 {print $NF}'`
+    if [ $train_accuracy ]; then
+        echo "Final Training Accuracy mAP: $train_accuracy"
+    fi
+
+    ############################## performance monitoring ##############################
+
+    Network=FasterRcnn_resnet50_ID0010_for_TensorFlow
+
+    DeviceType=`uname -m`
+    CaseName=${Network}${name_bind}_${backbone}_bs${batch_size}_${RANK_SIZE}'p'_'perf'
+    ActualFPS=${FPS}
+    TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*1000/'${FPS}'}'`
+
+    # extract loss values into train_${CaseName}_loss.txt; modify according to the model
+    grep "INFO:tensorflow:loss" $log_file|awk '{print $3}'|sed 's/,//g'|sed '/^$/d' >> $output_dir/$RANK_ID/train_${CaseName}_loss.txt
+
+    # the aggregated case log is kept in the last rank's directory (7)
+    ActualLoss=`awk 'END {print}' $output_dir/7/train_${CaseName}_loss.txt`
+    echo "Network = ${Network}" > $output_dir/7/${CaseName}.log
+    echo "RankSize = ${RANK_SIZE}" >> $output_dir/7/${CaseName}.log
+    echo "BatchSize = ${batch_size}" >> $output_dir/7/${CaseName}.log
+    echo "DeviceType = ${DeviceType}" >> $output_dir/7/${CaseName}.log
+    echo "CaseName = ${CaseName}" >> $output_dir/7/${CaseName}.log
+    echo "ActualFPS = ${ActualFPS}" >> $output_dir/7/${CaseName}.log
+    echo "TrainingTime = ${TrainingTime}" >> $output_dir/7/${CaseName}.log
+    echo "ActualLoss = ${ActualLoss}" >> $output_dir/7/${CaseName}.log
+    echo "E2ETrainingTime = ${e2e_time}" >> $output_dir/7/${CaseName}.log
+    if [ $train_accuracy ]; then
+        echo "TrainAccuracy = ${train_accuracy}" >> $output_dir/7/${CaseName}.log
+    fi
+
+    # eval support is still under development; temporarily monitor the final loss as the accuracy result
+    echo "Final Training Accuracy loss: $ActualLoss"
+done
+
+echo "########## copying slog ##########"
+cp -r /root/ascend/log/ $output_dir/slog
+echo "########## DONE copying slog ##########" diff --git a/TensorFlow/built-in/cv/detection/FasterRcnn_resnet50_ID0010_for_TensorFlow/test/train_RT2_performance_1p.sh b/TensorFlow/built-in/cv/detection/FasterRcnn_resnet50_ID0010_for_TensorFlow/test/train_RT2_performance_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..4c8193f82dc16974ac4f16282a8143264eadab15 --- /dev/null +++ b/TensorFlow/built-in/cv/detection/FasterRcnn_resnet50_ID0010_for_TensorFlow/test/train_RT2_performance_1p.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +export JOB_ID=10000 +export ENABLE_RUNTIME_V2=1 + +exec_mode='train' # or 'train_and_eval' +eval_after_training=False + +backbone='resnet50' +backbone_ckpt_path='/npu/traindata/resnet50_ckpt' +data_path='/npu/traindata/coco_official_2017' + +batch_size=2 +steps=1000 + +learning_rate_type='cosine' # or 'step' +learning_rate=0.003 +warmup_learning_rate=0.00025 +warmup_steps=16000 +learning_rate_levels='[0.0003, 0.00003]' +learning_rate_steps='[480000, 640000]' + +precision_mode='allow_mix_precision' +loss_scale_flag=0 +loss_scale_value=256 +overflow_dump=False + +########## params from command line ########## + +for arg in $* ; do + if [ ${arg:0:2} == '--' ]; then + arg=${arg:2} + pos=`expr index "$arg" =` + if [ $pos > 0 ]; then + var_name=${arg:0:$pos-1} + var_value=${arg:$pos} + eval $var_name=$var_value + fi + fi +done + +if [ ! $output_dir ]; then + output_dir="`pwd`/output/" +fi +echo output_dir=$output_dir + +training_file_pattern=${training_file_pattern:-$data_path'/tfrecord/train*'} +validation_file_pattern=${validation_file_pattern:-$data_path'/tfrecord/val*'} +val_json_file=${val_json_file:-$data_path'/annotations/instances_val2017.json'} + +########## build params_override ########## + +unset params_override +params_override=${params_override}backbone=$backbone, +params_override=${params_override}checkpoint="'$backbone_ckpt_path'", +params_override=${params_override}training_file_pattern="'$training_file_pattern'", +params_override=${params_override}validation_file_pattern="'$validation_file_pattern'", +params_override=${params_override}val_json_file="'$val_json_file'", +params_override=${params_override}train_batch_size=$batch_size, +params_override=${params_override}total_steps=$steps, +params_override=${params_override}learning_rate_type=$learning_rate_type, +params_override=${params_override}init_learning_rate=$learning_rate, +params_override=${params_override}warmup_learning_rate=$warmup_learning_rate, +params_override=${params_override}warmup_steps=$warmup_steps, +params_override=${params_override}learning_rate_levels="'$learning_rate_levels'", +params_override=${params_override}learning_rate_steps="'$learning_rate_steps'", +params_override=${params_override}npu_precision_mode=$precision_mode, +params_override=${params_override}npu_loss_scale_flag=$loss_scale_flag, +params_override=${params_override}npu_loss_scale=$loss_scale_value, +params_override=${params_override}npu_overflow_dump=$overflow_dump, + +echo [params_override] "$params_override" + +########## prepare environment ########## + +export RANK_SIZE=1 + +if [ ! 
$RANK_ID_START ]; then + if [ $ASCEND_DEVICE_ID ]; then + RANK_ID_START=$ASCEND_DEVICE_ID + elif [ $DEVICE_ID ]; then + RANK_ID_START=$DEVICE_ID + else + RANK_ID_START=0 + fi +fi +export RANK_ID_START +echo "RANK_ID_START="$RANK_ID_START + +BASE_PATH=`cd $(dirname $0); pwd`/../FasterRcnn +echo "BASE_PATH="$BASE_PATH + +########## run ########## + +start_time=$(date +%s) + +pids= +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + echo + /usr/local/Ascend/driver/tools/msnpureport -d $RANK_ID -g error + + TMP_PATH=$output_dir/$RANK_ID + mkdir -p $TMP_PATH + cd $TMP_PATH + + rm -f configs + ln -s $BASE_PATH/configs configs + + export RANK_ID + export DEVICE_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + export DEVICE_INDEX=$RANK_ID + + python3 $BASE_PATH/mask_rcnn_main.py --mode=$exec_mode \ + --eval_after_training=$eval_after_training \ + --model_dir=$TMP_PATH/result \ + --num_gpus=$RANK_SIZE \ + --params_override="$params_override" \ + $@ 2>&1 | tee $TMP_PATH/train_${RANK_ID}.log & + + pids[$RANK_ID-$RANK_ID_START]="$RANK_ID $!" + cd - +done + +sleep 1 +echo "########## Waiting for pids: "${pids[*]} + +for pid in "${pids[@]}"; do + pid=($pid) + RANK_ID=${pid[0]} + pid=${pid[1]} + + wait $pid + ret=$? + echo "******************** train finished ******************** $RANK_ID - $pid - ret : $ret" + + ############################## E2E训练时长 ############################## + end_time=$(date +%s) + e2e_time=$(( $end_time - $start_time )) + echo "Final Training Duration sec : $e2e_time" + + ############################## 业务日志 ############################## + grep ERROR /root/ascend/log/plog/plog-${pid}_*.log > $output_dir/$RANK_ID/plog_err.log + + log_file=$output_dir/$RANK_ID/train_${RANK_ID}.log + + ############################## 性能结果处理 ############################## + echo "-------------------- Final result --------------------" + #性能FPS计算,需要根据网络修改 + FPS=`grep -a 'INFO:tensorflow:global_step/sec: ' $log_file|awk 'END {print $2}'` + FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${FPS}'}'` + echo "Final Performance images/sec : $FPS" + + ############################## 精度结果处理 ############################## + #精度计算,需要根据网络修改 + train_accuracy=`grep "Average Precision" $log_file | awk 'NR==1 {print $NF}'` + if [ $train_accuracy ]; then + echo "Final Training Accuracy mAP: $train_accuracy" + fi + + ############################## 性能看护 ############################## + + Network=FasterRcnn_resnet50_ID0010_for_TensorFlow + + DeviceType=`uname -m` + CaseName=${Network}_${backbone}_bs${batch_size}_${RANK_SIZE}'p'_'perf' + ActualFPS=${FPS} + TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*1000/'${FPS}'}'` + + # 提取Loss到train_${CaseName}_loss.txt中,需要根据模型修改 + grep "INFO:tensorflow:loss" $log_file|awk '{print $3}'|sed 's/,//g'|sed '/^$/d' >> $output_dir/$RANK_ID/train_${CaseName}_loss.txt + + ActualLoss=`awk 'END {print}' $output_dir/$RANK_ID/train_${CaseName}_loss.txt` + echo "Network = ${Network}" > $output_dir/$RANK_ID/${CaseName}.log + echo "RankSize = ${RANK_SIZE}" >> $output_dir/$RANK_ID/${CaseName}.log + echo "BatchSize = ${batch_size}" >> $output_dir/$RANK_ID/${CaseName}.log + echo "DeviceType = ${DeviceType}" >> $output_dir/$RANK_ID/${CaseName}.log + echo "CaseName = ${CaseName}" >> $output_dir/$RANK_ID/${CaseName}.log + echo "ActualFPS = ${ActualFPS}" >> $output_dir/$RANK_ID/${CaseName}.log + echo "TrainingTime = ${TrainingTime}" >> $output_dir/$RANK_ID/${CaseName}.log + echo "ActualLoss = ${ActualLoss}" >> $output_dir/$RANK_ID/${CaseName}.log + echo 
"E2ETrainingTime = ${e2e_time}" >> $output_dir/$RANK_ID/${CaseName}.log + if [ $train_accuracy ]; then + echo "TrainAccuracy = ${train_accuracy}" >> $output_dir/$RANK_ID/${CaseName}.log + fi + + #eval版本需求开发中,精度结果临时看护最终的loss + echo "Final Training Accuracy loss: $ActualLoss" +done \ No newline at end of file diff --git a/TensorFlow/built-in/cv/detection/MaskRcnn_ID0011_for_TensorFlow/test/train_RT2_performance_1p.sh b/TensorFlow/built-in/cv/detection/MaskRcnn_ID0011_for_TensorFlow/test/train_RT2_performance_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..3f0ce9cc497e67ba3663906d0c8efcae721d8d11 --- /dev/null +++ b/TensorFlow/built-in/cv/detection/MaskRcnn_ID0011_for_TensorFlow/test/train_RT2_performance_1p.sh @@ -0,0 +1,224 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' + +#集合通信参数,不需要修改 +#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 +export JOB_ID=9999001 +export RANK_SIZE=1 +export ENABLE_RUNTIME_V2=1 +#export RANK_TABLE_FILE=${cur_path}/../configs/rank_table_8p.json +RANK_ID_START=0 + + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#设置默认日志级别,不需要修改 +export ASCEND_GLOBAL_LOG_LEVEL=3 + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="MaskRcnn_ID0011_for_TensorFlow" + +batch_size=2 +total_steps=20 + +#TF2.X独有,不需要修改 +#export NPU_LOOP_SIZE=${train_steps} + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False +autotune=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_8p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + fi 
+done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +#autotune时,先开启autotune执行单P训练,不需要修改 +if [[ $autotune == True ]]; then + train_full_1p.sh --autotune=$autotune --data_path=$data_path + wait + autotune=False +fi + +#修改save ckpt,print +sed -i "s|save_checkpoints_steps=90000|save_checkpoints_steps=${total_steps}|g" $cur_path/../distributed_executer.py +sed -i "s|log_step_count_steps=100|log_step_count_steps=1|g" $cur_path/../distributed_executer.py + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + + # 自行添加环境变量 + + export DEVICE_ID=$RANK_ID + DEVICE_INDEX=$DEVICE_ID + export DEVICE_INDEX=${DEVICE_INDEX} + export FUSION_TENSOR_SIZE=1000000000 + # for producible results + export TF_DETERMINISTIC_OPS=1 + export TF_CUDNN_DETERMINISM=1 + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + + + + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path + corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 + let c=b*${corenum}/8-1 + if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" + fi + ${bind_core} python3 mask_rcnn_main.py --mode=train \ + --rank=$RANK_ID \ + --total_steps=$total_steps \ + --Data_path=$data_path \ + --train_batch_size=2 \ + --training_file_pattern=${data_path}/train* \ + --validation_file_pattern=${data_path}/val* \ + --val_json_file=${data_path}/instances_val2017.json \ + --eval_batch_size=2 \ + --model_dir=result_npu\ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ + > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + #--data_dump_flag=${data_dump_flag} \ + #--data_dump_step=${data_dump_step} \ + #--data_dump_path=${data_dump_path} \ + #--profiling=${profiling} \ + #--profiling_dump_path=${profiling_dump_path} \ + #--autotune=${autotune} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#参数回改 +sed -i "s|save_checkpoints_steps=${total_steps}|save_checkpoints_steps=90000|g" $cur_path/../distributed_executer.py +sed -i "s|log_step_count_steps=1|log_step_count_steps=100|g" $cur_path/../distributed_executer.py + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPSper=`grep "] global_step/sec:" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $6}'|tail -n +3|awk '{sum+=$1} END {print sum/NR}'` +FPS=`awk 'BEGIN{printf "%f\n",'${batch_size}'*'${RANK_SIZE}'*'${FPSper}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep "Average Precision" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|head -1|awk '{print $13}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` 
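+# Throughput math used by this section (a sketch with assumed numbers, not measured values):
+# FPSper is the mean of the logged "global_step/sec" samples, with the first two
+# dropped as warm-up (tail -n +3). With batch_size=2, RANK_SIZE=1 and an average of,
+# say, 5 steps/sec, FPS = 2 * 1 * 5 = 10 images/sec, and TrainingTime below is
+# 1 / FPSper = 0.20 seconds per step.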
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p_RT2_perf'
+
+## collect performance data
+# throughput; no need to modify
+ActualFPS=${FPS}
+# per-iteration training time; no need to modify
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",1/'${FPSper}'}'`
+
+# extract loss values from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep "] loss =" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $7}'|cut -d , -f 1 > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# loss of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# print key info into ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+#echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/TensorFlow/built-in/cv/detection/MaskRcnn_ID0011_for_TensorFlow/test/train_RT2_performance_8p.sh b/TensorFlow/built-in/cv/detection/MaskRcnn_ID0011_for_TensorFlow/test/train_RT2_performance_8p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b6fcf9836eb37f49bb77e7d92583602e6db6d4cb
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/MaskRcnn_ID0011_for_TensorFlow/test/train_RT2_performance_8p.sh
@@ -0,0 +1,228 @@
+#!/bin/bash
+
+# current path; no need to modify
+cur_path=`pwd`
+
+export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+
+# collective communication parameters; no need to modify
+# make sure the rank table file rank_table_8p.json is placed in the configs directory at the same level as test
+export JOB_ID=9999001
+export RANK_SIZE=8
+export RANK_TABLE_FILE=${cur_path}/../configs/rank_table_8p.json
+export ENABLE_RUNTIME_V2=1
+
+RANK_ID_START=0
+
+
+# dataset path; keep empty, no need to modify
+data_path=""
+
+# set the default log level; no need to modify
+export ASCEND_GLOBAL_LOG_LEVEL=3
+
+# basic parameters; review and modify per model
+# network name, same as the directory name
+Network="MaskRcnn_ID0011_for_TensorFlow"
+
+batch_size=2
+total_steps=20
+
+# TF2.X only; no need to modify
+#export NPU_LOOP_SIZE=${train_steps}
+
+# debug parameters; precision_mode needs per-model review
+precision_mode="allow_mix_precision"
+# maintenance parameters; no need to modify the following
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+autotune=False
+
+# help message; no need to modify
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_RT2_performance_8p.sh "
+    echo " "
+    echo "parameter explanation:
+    --precision_mode           precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump                whether to enable overflow detection, default is False
+    --data_dump_flag           data dump flag, default is False
+    --data_dump_step           data dump step, default is 10
+    --profiling                whether to enable profiling for performance debugging, default is False
+    --autotune                 whether to enable autotune, default is False
+    --data_path                source data of training
+    -h/--help                  show help message
+    "
+    exit 1
+fi
+
+# parameter check; no need to modify
+for para in $*
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --autotune* ]];then
+        autotune=`echo ${para#*=}`
+        mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak
+        mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak
+        autotune_dump_path=${cur_path}/output/autotune_dump
+        mkdir -p ${autotune_dump_path}/GA
+        mkdir -p ${autotune_dump_path}/RL
+        cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/
+        cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --bind_core* ]]; then
+        bind_core=`echo ${para#*=}`
+        name_bind="_bindcore"
+    fi
+done
+
+# verify that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+# when autotune is enabled, first run single-P training with autotune on; no need to modify
+if [[ $autotune == True ]]; then
+    train_full_1p.sh --autotune=$autotune --data_path=$data_path
+    wait
+    autotune=False
+fi
+
+# adjust save-ckpt and print frequency
+sed -i "s|save_checkpoints_steps=90000|save_checkpoints_steps=${total_steps}|g" $cur_path/../distributed_executer.py
+sed -i "s|log_step_count_steps=100|log_step_count_steps=1|g" $cur_path/../distributed_executer.py
+
+# training start time; no need to modify
+start_time=$(date +%s)
+
+# enter the training script directory; review per model
+cd $cur_path/../
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    # set environment variables; no need to modify
+    echo "Device ID: $RANK_ID"
+    export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    ASCEND_DEVICE_ID=$RANK_ID
+
+    # add your own environment variables here
+
+    export DEVICE_ID=$RANK_ID
+    DEVICE_INDEX=$DEVICE_ID
+    export DEVICE_INDEX=${DEVICE_INDEX}
+    export FUSION_TENSOR_SIZE=1000000000
+    # for reproducible results
+    export TF_DETERMINISTIC_OPS=1
+    export TF_CUDNN_DETERMINISM=1
+
+    # create the DeviceID output directory; no need to modify
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    else
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    fi
+
+
+
+    # run the training script; the arguments below do not need changes, others require per-model review
+    #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path
+    corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l`
+    let a=RANK_ID*${corenum}/8
+    let b=RANK_ID+1
+    let c=b*${corenum}/8-1
+    if [ "x${bind_core}" != x ];then
+        bind_core="taskset -c $a-$c"
+    fi
+    ${bind_core} python3 mask_rcnn_rt_main.py --mode=train \
+        --rank=$RANK_ID \
+        --total_steps=$total_steps \
+        --Data_path=$data_path \
+        --train_batch_size=2 \
+        --training_file_pattern=${data_path}/train* \
+        --validation_file_pattern=${data_path}/val* \
+        --val_json_file=${data_path}/instances_val2017.json \
+        --eval_batch_size=2 \
+        --model_dir=result_npu \
+        --over_dump=${over_dump} \
+        --over_dump_path=${over_dump_path} \
+        > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+        #--data_dump_flag=${data_dump_flag} \
+        #--data_dump_step=${data_dump_step} \
+        #--data_dump_path=${data_dump_path} \
+        #--profiling=${profiling} \
+        #--profiling_dump_path=${profiling_dump_path} \
+        #--autotune=${autotune} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+
+# training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# revert the parameter changes
+sed -i "s|save_checkpoints_steps=${total_steps}|save_checkpoints_steps=90000|g" $cur_path/../distributed_executer.py
+sed -i "s|log_step_count_steps=1|log_step_count_steps=100|g" $cur_path/../distributed_executer.py
+
+# print results; no need to modify
+echo "------------------ Final result ------------------"
+# output performance FPS; review per model
+#FPSper=`grep "] global_step/sec:" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $6}'`
+FPSper=`grep "] global_step/sec:" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $6}'|tail -n 10|awk '{sum+=$1} END {print sum/NR}'`
+FPS=`awk 'BEGIN{printf "%f\n",'${batch_size}'*'${RANK_SIZE}'*'${FPSper}'}'`
+# print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# output training accuracy; review per model
+train_accuracy=`grep "Average Precision" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|head -1|awk '{print $13}'`
+# print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# stability/accuracy monitoring summary
+# training case info; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p_RT2_perf'
+
+## collect performance data
+# throughput; no need to modify
+ActualFPS=${FPS}
+# per-iteration training time; no need to modify
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",1/'${FPSper}'}'`
+
+# extract loss values from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep "] loss =" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $7}'|cut -d , -f 1 > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# loss of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# print key info into ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+#echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/TensorFlow/built-in/cv/image_classification/Face-ResNet50_ID1372_for_TensorFlow/TrainResNet_rt.py b/TensorFlow/built-in/cv/image_classification/Face-ResNet50_ID1372_for_TensorFlow/TrainResNet_rt.py
new file mode 100644
index 0000000000000000000000000000000000000000..976408b01f27d5afffe97fa7fd9efd6e58721398
--- /dev/null
+++ b/TensorFlow/built-in/cv/image_classification/Face-ResNet50_ID1372_for_TensorFlow/TrainResNet_rt.py
@@ -0,0 +1,264 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Copyright 2019 Google LLC +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Used to train ResNet-50 +Author: Kaihua Tang +""" +#npu modify begin +from npu_bridge.npu_init import * +#npu modify end +import argparse +import math +import time +import tensorflow as tf +import ResNet as resnet +import numpy as np +import scipy.io as scio +from scipy import misc +from utils import * + +def parse_args(): + desc = "MAIN" + parser = argparse.ArgumentParser(description=desc) + parser.add_argument('--label_path', type=str, default='./label/label_1200.npy', help='Path of Label.npy') + parser.add_argument('--image_name_path', type=str, default='./label/name_1200.npy', help='Path of image file names') + ############################add train_data_path################################## + parser.add_argument('--train_data_path', type=str, default='./train_data/1200_data.npy', help='Path of train data') + ############################add train_data_path################################## + parser.add_argument('--parentPath', type=str, default='./CACD2000_Crop/', help='image path') + parser.add_argument('--epochs', type=int, default=100, help='NUM_EPOCHS') + return parser.parse_args() +args = parse_args() + +# image size +WIDTH = 224 +HEIGHT = 224 +CHANNELS = 3 +#"Mini batch size" +MINI_BATCH_SIZE = 32 +#"Path of Label.npy" +label_path = args.label_path +#"Path of image file names" +image_name_path = args.image_name_path +# image path +parentPath = args.parentPath +# train data Path: n * 224 * 224 * 3 numpy matrix +data_path = args.train_data_path + +def dataset_generator(image, label): + for i in range(image.shape[0]): + yield image[i], label[i]-1 + +def make_dataset(allImageData, trainLabelList, batch_size, epoch): + ds = tf.data.Dataset.from_generator(lambda: dataset_generator(allImageData, trainLabelList), + (tf.float32, tf.int32), + (tf.TensorShape([WIDTH, HEIGHT, CHANNELS]), tf.TensorShape([])) + ) + ds = 
ds.shuffle(buffer_size=100971) + ds = ds.batch(batch_size) + ds = ds.repeat(epoch+1) + ds = ds.prefetch(buffer_size=tf.contrib.data.AUTOTUNE) + return ds + +def Train(epochs=100): + """ + HyperParameters of the Net + model_path: path of pretrained model, set None if there is no such a model. + LABELSNUM: Number of output labels + learning_rate_orig : original learning rate + NUM_EPOCHS: number of epochs + save_frequency: frequency of saving model (number of epoches) + """ + model_path = None + LABELSNUM = 1200 + learning_rate_orig = 1e-06 + NUM_EPOCHS = epochs + save_frequency = 2 + """ + Classification Layer + final_layer_type: softmax or sigmoid + is_sparse: when final layer is softmax, is it sparse + """ + final_layer_type ="softmax" + is_sparse = True + """ + Tensorboard Setting + tensorboard_on: Turn on Tensorboard or not + TensorBoard_refresh: refresh rate (number of batches) + monitoring_rate: Print output rate + """ + tensorboard_on = False + TensorBoard_refresh = 50 + monitoring_rate = 50 + + #Lists that store name of image and its label + trainNameList = np.load(image_name_path) + trainLabelList = np.load(label_path) + if(data_path is None): + allImageData = load_all_image(trainNameList, HEIGHT, WIDTH, CHANNELS, parentPath, create_npy=True) + else: + allImageData = np.load(data_path) + + #num of total training image + num_train_image = trainLabelList.shape[0] + + #############npu modify start############### + global_config = tf.ConfigProto() + custom_op = global_config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + #custom_op.parameter_map["dynamic_input"].b = 1 + #custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes("lazy_recompile") + global_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + global_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + #with tf.Session() as sess: + with tf.Session(config=global_config) as sess: + train_dataset = make_dataset(allImageData, trainLabelList, MINI_BATCH_SIZE, NUM_EPOCHS) + iterator = train_dataset.make_initializable_iterator() + next_element = iterator.get_next() + #############npu modify end############### + images = tf.placeholder(tf.float32, shape = [None, WIDTH, HEIGHT, CHANNELS]) + if(is_sparse): + labels = tf.placeholder(tf.int64, shape = [None]) + else: + labels = tf.placeholder(tf.float32, shape = [None, LABELSNUM]) + + # build resnet model + resnet_model = resnet.ResNet(ResNet_npy_path = model_path) + resnet_model.build(images, LABELSNUM, final_layer_type) + # number of batches per epoch + # num_minibatches = int(num_train_image / MINI_BATCH_SIZE) + num_minibatches = math.ceil(num_train_image / MINI_BATCH_SIZE) + + # cost function + # learning_rate = learning_rate_orig + with tf.name_scope("cost"): + if(final_layer_type == "sigmoid"): + print("Using weighted sigmoid loss") + loss = tf.nn.weighted_cross_entropy_with_logits(logits = resnet_model.fc1, targets = labels, pos_weight = 5.0) + elif(final_layer_type == "softmax" and is_sparse): + print("Using sparse softmax loss") + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits = resnet_model.fc1, labels = labels) + elif(final_layer_type == "softmax" and (not is_sparse)): + print("Using softmax loss") + loss = tf.nn.softmax_cross_entropy_with_logits(logits = resnet_model.fc1, labels = labels) + cost = tf.reduce_sum(loss) + with tf.name_scope("train"): + global_steps = tf.Variable(0, name='global_step', trainable=False) + learning_rate = 
tf.train.exponential_decay(learning_rate_orig, global_steps, num_minibatches * 40, 0.1, staircase = True) + #train = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) + #train = tf.train.AdamOptimizer(learning_rate).minimize(cost) + #npu modify begin + train = tf.train.MomentumOptimizer(learning_rate, 0.9).minimize(cost, global_step=global_steps) + # train = npu_tf_optimizer(tf.train.MomentumOptimizer(learning_rate, 0.9)).minimize(cost, global_step=global_steps) + #npu modify end + + sess.run(tf.global_variables_initializer()) + sess.run(iterator.initializer) + print(resnet_model.get_var_count()) + + if(tensorboard_on): + merged_summary = tf.summary.merge_all() + writer = tf.summary.FileWriter("./TensorBoard/Result") + writer.add_graph(sess.graph) + # used in tensorboard to count record times + summary_times = 0 + + for epoch in range(NUM_EPOCHS): + print("Start Epoch %i" % (epoch + 1)) + start_time = time.time() + minibatch_cost = 0.0 + # count the number of batch + # batch_index = 0 + # get index for all mini batches + # minibatches = random_mini_batches(num_train_image, MINI_BATCH_SIZE, random = True) + + # for minibatch in minibatches: + for batch_index in range(num_minibatches): + # get train examples from each mini batch + # (minibatch_X, minibatch_Y) = get_minibatch(minibatch, trainLabelList, HEIGHT, WIDTH, CHANNELS, LABELSNUM, allImageData, is_sparse) + (minibatch_X, minibatch_Y) = sess.run(next_element) + # change learning rate + print('======================',(sess.run(global_steps))) + #sess.run(global_steps.assign(epoch * num_minibatches + batch_index)) + + # record examples to monitoring the training process + if((batch_index % monitoring_rate == 0)): + resnet_model.set_is_training(False) + fc1, prob = sess.run([resnet_model.fc1, resnet_model.prob], feed_dict={images: minibatch_X}) + countMax = np.sum(np.argmax(prob,1) == minibatch_Y) + print("Epoch %i Batch %i Before Optimization Count %i" %(epoch + 1,batch_index, countMax)) + + # Training and calculating cost + resnet_model.set_is_training(True) + temp_cost, _ = sess.run([cost, train], feed_dict={images: minibatch_X, labels: minibatch_Y}) + minibatch_cost += np.sum(temp_cost) + + # tensorboard + if(tensorboard_on) and (batch_index % TensorBoard_refresh == 0): + s = sess.run(merged_summary, feed_dict={images: minibatch_X, labels: minibatch_Y}) + writer.add_summary(s, summary_times) + summary_times = summary_times + 1 + # record cost in tensorflow + tf.summary.scalar('cost', temp_cost) + + # record examples to monitoring the training process + if((batch_index % monitoring_rate == 0)): + resnet_model.set_is_training(False) + fc1, prob = sess.run([resnet_model.fc1, resnet_model.prob], feed_dict={images: minibatch_X}) + countMax = np.sum(np.argmax(prob,1) == minibatch_Y) + print("Epoch %i Batch %i After Optimization Count %i" %(epoch + 1,batch_index, countMax)) + # Temp Cost & learning rate + print("Epoch %i Batch %i Batch Cost %f Learning_rate %f" %(epoch + 1,batch_index, np.sum(temp_cost), sess.run(learning_rate) * 1e10)) + + # batch_index += 1 + + end_time = time.time() + print("steps_per_s: ", str(num_train_image/(end_time - start_time)/MINI_BATCH_SIZE)) + # print total cost of this epoch + print("End Epoch %i" % (epoch + 1)) + print("Total cost of Epoch %f" % minibatch_cost) + + # save model + if((epoch + 1) % save_frequency == 0): + resnet_model.save_npy(sess, './model/temp-model%i.npy' % (epoch + 1)) + +if __name__ == '__main__': + Train(args.epochs) diff --git 
a/TensorFlow/built-in/cv/image_classification/Face-ResNet50_ID1372_for_TensorFlow/test/train_RT2_performance_1p.sh b/TensorFlow/built-in/cv/image_classification/Face-ResNet50_ID1372_for_TensorFlow/test/train_RT2_performance_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..31cd52203451e8d1919c8e2e52e30db591839255
--- /dev/null
+++ b/TensorFlow/built-in/cv/image_classification/Face-ResNet50_ID1372_for_TensorFlow/test/train_RT2_performance_1p.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+
+cur_path=`pwd`/../
+# print logs to the screen for failed cases
+#export ASCEND_SLOG_PRINT_TO_STDOUT=1
+export ENABLE_RUNTIME_V2=1
+
+# basic parameters; review and modify per model
+# Batch Size
+batch_size=32
+# network name, same as the directory name
+Network="Face-ResNet50_ID1372_for_TensorFlow"
+# number of devices; defaults to 1 for a single card
+RANK_SIZE=1
+# training epochs (optional)
+train_epochs=2
+# training steps
+train_steps=
+# learning rate
+learning_rate=
+
+# parameter configuration
+data_path=""
+#work_dir="$cur_path/estimator_working_dir"
+#export_path="$cur_path/outputs/models/000001-first_generation"
+
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_performance_1p.sh"
+    exit 1
+fi
+
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+########## run training ##########
+cd $cur_path
+if [ -d $cur_path/test/output ];then
+    rm -rf $cur_path/test/output/*
+    mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID
+fi
+wait
+
+#sed -i "s|./CACD2000_Crop/|${data_path}/|g" TrainResNet.py
+#sed -i "s|./label|${data_path}/label|g" TrainResNet.py
+
+start=$(date +%s)
+nohup python3 TrainResNet_rt.py \
+    --label_path ${data_path}/label/label_1200.npy \
+    --image_name_path ${data_path}/label/name_1200.npy \
+    --train_data_path ${data_path}/train_data/1200_data.npy \
+    --parentPath ${data_path}/CACD2000_Crop/ \
+    --epochs 2 > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+wait
+end=$(date +%s)
+e2e_time=$(( $end - $start ))
+
+# print results; no need to modify
+echo "------------------ Final result ------------------"
+
+
+# output performance FPS; review per model
+steps_per_s=`grep steps_per_s ${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk 'END{print $2}'`
+FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${steps_per_s}'}'`
+# print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+
+# output training accuracy; review per model
+train_accuracy="None"
+# print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# performance monitoring summary
+# training case info; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p_RT2_perf'
+
+
+## collect performance data; no need to modify
+# throughput
+ActualFPS=${FPS}
+
+
+# per-iteration training time
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'/'${FPS}'}'`
+
+
+# extract loss values from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Cost $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $7}' > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+
+# loss of the last iteration (read-only)
+ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# print key info into ${CaseName}.log (read-only)
+echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/dual_net_rt.py b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/dual_net_rt.py
new file mode 100644
index 0000000000000000000000000000000000000000..90b8f4228b3c89b46a0ce3c3eb21ec24324b4500
--- /dev/null
+++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/dual_net_rt.py
@@ -0,0 +1,850 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Copyright 2018 Google LLC
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+The policy and value networks share a majority of their architecture.
+This helps the intermediate layers extract concepts that are relevant to both
+move prediction and score estimation.
+""" +from npu_bridge.npu_init import * + +from absl import flags +import functools +import json +import logging +import os.path +import struct +import tempfile +import time +import numpy as np +import random + +import tensorflow as tf +from tensorflow.contrib import cluster_resolver as contrib_cluster_resolver +from tensorflow.contrib import quantize as contrib_quantize +from tensorflow.contrib import summary as contrib_summary +from tensorflow.contrib import tpu as contrib_tpu +from tensorflow.contrib.tpu.python.tpu import tpu_config as contrib_tpu_python_tpu_tpu_config +from tensorflow.contrib.tpu.python.tpu import tpu_estimator as contrib_tpu_python_tpu_tpu_estimator +from tensorflow.contrib.tpu.python.tpu import tpu_optimizer as contrib_tpu_python_tpu_tpu_optimizer + +import features as features_lib +import go +import symmetries +import minigo_model + + +flags.DEFINE_integer('train_batch_size', 256, + 'Batch size to use for train/eval evaluation. For GPU ' + 'this is batch size as expected. If \"use_tpu\" is set,' + 'final batch size will be = train_batch_size * num_tpu_cores') + +flags.DEFINE_integer('conv_width', 256 if go.N == 19 else 32, + 'The width of each conv layer in the shared trunk.') + +flags.DEFINE_integer('policy_conv_width', 2, + 'The width of the policy conv layer.') + +flags.DEFINE_integer('value_conv_width', 1, + 'The width of the value conv layer.') + +flags.DEFINE_integer('fc_width', 256 if go.N == 19 else 64, + 'The width of the fully connected layer in value head.') + +flags.DEFINE_integer('trunk_layers', go.N, + 'The number of resnet layers in the shared trunk.') + +flags.DEFINE_multi_integer('lr_boundaries', [400000, 600000], + 'The number of steps at which the learning rate will decay') + +flags.DEFINE_multi_float('lr_rates', [0.01, 0.001, 0.0001], + 'The different learning rates') + +flags.DEFINE_integer('training_seed', 0, + 'Random seed to use for training and validation') + +flags.register_multi_flags_validator( + ['lr_boundaries', 'lr_rates'], + lambda flags: len(flags['lr_boundaries']) == len(flags['lr_rates']) - 1, + 'Number of learning rates must be exactly one greater than the number of boundaries') + +flags.DEFINE_float('l2_strength', 1e-4, + 'The L2 regularization parameter applied to weights.') + +flags.DEFINE_float('value_cost_weight', 1.0, + 'Scalar for value_cost, AGZ paper suggests 1/100 for ' + 'supervised learning') + +flags.DEFINE_float('sgd_momentum', 0.9, + 'Momentum parameter for learning rate.') + +flags.DEFINE_string('work_dir', None, + 'The Estimator working directory. Used to dump: ' + 'checkpoints, tensorboard logs, etc..') + +flags.DEFINE_bool('use_tpu', False, 'Whether to use TPU for training.') + +flags.DEFINE_string( + 'tpu_name', None, + 'The Cloud TPU to use for training. This should be either the name used' + 'when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.') + +flags.DEFINE_integer( + 'num_tpu_cores', default=8, + help=('Number of TPU cores. For a single TPU device, this is 8 because each' + ' TPU has 4 chips each with 2 cores.')) + +flags.DEFINE_string('gpu_device_list', None, + 'Comma-separated list of GPU device IDs to use.') + +flags.DEFINE_bool('quantize', False, + 'Whether create a quantized model. 
When loading a model for ' + 'inference, this must match how the model was trained.') + +flags.DEFINE_integer('quant_delay', 700 * 1024, + 'Number of training steps after which weights and ' + 'activations are quantized.') + +flags.DEFINE_integer( + 'iterations_per_loop', 128, + help=('Number of steps to run on TPU before outfeeding metrics to the CPU.' + ' If the number of iterations in the loop would exceed the number of' + ' train steps, the loop will exit before reaching' + ' --iterations_per_loop. The larger this value is, the higher the' + ' utilization on the TPU.')) + +flags.DEFINE_integer( + 'summary_steps', default=256, + help='Number of steps between logging summary scalars.') + +flags.DEFINE_integer( + 'keep_checkpoint_max', default=5, help='Number of checkpoints to keep.') + +flags.DEFINE_bool( + 'use_random_symmetry', True, + help='If true random symmetries be used when doing inference.') + +flags.DEFINE_bool( + 'use_SE', False, + help='Use Squeeze and Excitation.') + +flags.DEFINE_bool( + 'use_SE_bias', False, + help='Use Squeeze and Excitation with bias.') + +flags.DEFINE_integer( + 'SE_ratio', 2, + help='Squeeze and Excitation ratio.') + +flags.DEFINE_bool( + 'use_swish', False, + help=('Use Swish activation function inplace of ReLu. ' + 'https://arxiv.org/pdf/1710.05941.pdf')) + +flags.DEFINE_bool( + 'bool_features', False, + help='Use bool input features instead of float') + +flags.DEFINE_string( + 'input_features', 'agz', + help='Type of input features: "agz" or "mlperf07"') + +flags.DEFINE_string( + 'input_layout', 'nhwc', + help='Layout of input features: "nhwc" or "nchw"') + +flags.DEFINE_string( + 'dynamic_input', '1', + help='--dynamic_input=1 Use fuzzy compilation. --dynamic_input=lazy_recompile Compile using lazy static graph') + +# TODO(seth): Verify if this is still required. 
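+# The validator below ties the two TPU flags together: metrics are only
+# outfed to the CPU at loop boundaries, so summary_steps must be a whole
+# multiple of iterations_per_loop when --use_tpu is set. For example (assumed
+# values), iterations_per_loop=128 with summary_steps=256 passes, while
+# summary_steps=200 would be rejected at startup.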
+flags.register_multi_flags_validator( + ['use_tpu', 'iterations_per_loop', 'summary_steps'], + lambda flags: (not flags['use_tpu'] or + flags['summary_steps'] % flags['iterations_per_loop'] == 0), + 'If use_tpu, summary_steps must be a multiple of iterations_per_loop') + +FLAGS = flags.FLAGS + + +class DualNetwork(): + def __init__(self, save_file): + ############################ set dynamic_input = True start########################################### + #set dynamic_input = True + global_config = tf.ConfigProto() + custom_op = global_config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + #custom_op.parameter_map["dynamic_input"].b = True + print('========= DualNetwork DYNAMIC INPUT = %s =========' % FLAGS.dynamic_input) + #if FLAGS.dynamic_input == "lazy_recompile": + # custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes("lazy_recompile") + #if FLAGS.dynamic_input == "1": + # custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes("dynamic_execute") + #else: + # print("Enter correct compilation parameters.") + global_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + global_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + + self.save_file = save_file + self.inference_input = None + self.inference_output = None + config = npu_config_proto(config_proto=global_config) + ############################ set dynamic_input = True end########################################### + + config.gpu_options.allow_growth = True + if FLAGS.gpu_device_list is not None: + config.gpu_options.visible_device_list = FLAGS.gpu_device_list + self.sess = tf.Session(config=npu_config_proto(config_proto=config), graph=tf.Graph()) + self.initialize_graph() + + def initialize_graph(self): + with self.sess.graph.as_default(): + features, labels = get_inference_input() + params = FLAGS.flag_values_dict() + logging.info('TPU inference is supported on C++ only. ' + 'DualNetwork will ignore use_tpu=True') + params['use_tpu'] = False + estimator_spec = model_fn(features, labels, + tf.estimator.ModeKeys.PREDICT, + params=params) + self.inference_input = features + self.inference_output = estimator_spec.predictions + if self.save_file is not None: + self.initialize_weights(self.save_file) + else: + self.sess.run(tf.global_variables_initializer()) + + def initialize_weights(self, save_file): + """Initialize the weights from the given save_file. + Assumes that the graph has been constructed, and the + save_file contains weights that match the graph. 
Used + to set the weights to a different version of the player + without redifining the entire graph.""" + tf.train.Saver().restore(self.sess, save_file) + + def run(self, position): + probs, values = self.run_many([position]) + return probs[0], values[0] + + def run_many(self, positions): + f = get_features() + processed = [features_lib.extract_features(p, f) for p in positions] + if FLAGS.use_random_symmetry: + syms_used, processed = symmetries.randomize_symmetries_feat( + processed) + outputs = self.sess.run(self.inference_output, + feed_dict={self.inference_input: processed}) + probabilities, value = outputs['policy_output'], outputs['value_output'] + if FLAGS.use_random_symmetry: + probabilities = symmetries.invert_symmetries_pi( + syms_used, probabilities) + return probabilities, value + + +def get_features_planes(): + if FLAGS.input_features == 'agz': + return features_lib.AGZ_FEATURES_PLANES + elif FLAGS.input_features == 'mlperf07': + return features_lib.MLPERF07_FEATURES_PLANES + else: + raise ValueError('unrecognized input features "%s"' % + FLAGS.input_features) + + +def get_features(): + if FLAGS.input_features == 'agz': + return features_lib.AGZ_FEATURES + elif FLAGS.input_features == 'mlperf07': + return features_lib.MLPERF07_FEATURES + else: + raise ValueError('unrecognized input features "%s"' % + FLAGS.input_features) + + +def get_inference_input(): + """Set up placeholders for input features/labels. + + Returns the feature, output tensors that get passed into model_fn.""" + feature_type = tf.bool if FLAGS.bool_features else tf.float32 + if FLAGS.input_layout == 'nhwc': + feature_shape = [None, go.N, go.N, get_features_planes()] + elif FLAGS.input_layout == 'nchw': + feature_shape = [None, get_features_planes(), go.N, go.N] + else: + raise ValueError('invalid input_layout "%s"' % FLAGS.input_layout) + return (tf.placeholder(feature_type, feature_shape, name='pos_tensor'), + {'pi_tensor': tf.placeholder(tf.float32, [None, go.N * go.N + 1]), + 'value_tensor': tf.placeholder(tf.float32, [None])}) + + +def model_fn(features, labels, mode, params): + """ + Create the model for estimator api + + Args: + features: if input_layout == 'nhwc', a tensor with shape: + [BATCH_SIZE, go.N, go.N, get_features_planes()] + else, a tensor with shape: + [BATCH_SIZE, get_features_planes(), go.N, go.N] + labels: dict from string to tensor with shape + 'pi_tensor': [BATCH_SIZE, go.N * go.N + 1] + 'value_tensor': [BATCH_SIZE] + mode: a tf.estimator.ModeKeys (batchnorm params update for TRAIN only) + params: A dictionary (Typically derived from the FLAGS object.) 
+ Returns: tf.estimator.EstimatorSpec with props + mode: same as mode arg + predictions: dict of tensors + 'policy': [BATCH_SIZE, go.N * go.N + 1] + 'value': [BATCH_SIZE] + loss: a single value tensor + train_op: train op + eval_metric_ops + return dict of tensors + logits: [BATCH_SIZE, go.N * go.N + 1] + """ + + policy_output, value_output, logits = model_inference_fn( + features, mode == tf.estimator.ModeKeys.TRAIN, params) + + # train ops + policy_cost = tf.reduce_mean( + tf.nn.softmax_cross_entropy_with_logits_v2( + logits=logits, labels=tf.stop_gradient(labels['pi_tensor']))) + + value_cost = params['value_cost_weight'] * tf.reduce_mean( + tf.square(value_output - labels['value_tensor'])) + + reg_vars = [v for v in tf.trainable_variables() + if 'bias' not in v.name and 'beta' not in v.name] + l2_cost = params['l2_strength'] * \ + tf.add_n([tf.nn.l2_loss(v) for v in reg_vars]) + + combined_cost = policy_cost + value_cost + l2_cost + + global_step = tf.train.get_or_create_global_step() + learning_rate = tf.train.piecewise_constant( + global_step, params['lr_boundaries'], params['lr_rates']) + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + + # Insert quantization ops if requested + if params['quantize']: + if mode == tf.estimator.ModeKeys.TRAIN: + contrib_quantize.create_training_graph( + quant_delay=params['quant_delay']) + else: + contrib_quantize.create_eval_graph() + + ######################################## NPU_8p start ##################################### + #optimizer = npu_tf_optimizer(tf.train.MomentumOptimizer( + #learning_rate, params['sgd_momentum'])) + optimizer = NPUDistributedOptimizer(tf.train.MomentumOptimizer( + learning_rate, params['sgd_momentum'])) + ######################################## NPU_8p end ##################################### + + if params['use_tpu']: + optimizer = contrib_tpu_python_tpu_tpu_optimizer.CrossShardOptimizer( + optimizer) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(combined_cost, global_step=global_step) + + # Computations to be executed on CPU, outside of the main TPU queues. + def eval_metrics_host_call_fn(policy_output, value_output, pi_tensor, + value_tensor, policy_cost, value_cost, + l2_cost, combined_cost, step, + est_mode=tf.estimator.ModeKeys.TRAIN): + policy_entropy = -tf.reduce_mean(tf.reduce_sum( + policy_output * tf.log(policy_output), axis=1)) + # pi_tensor is one_hot when generated from sgfs (for supervised learning) + # and soft-max when using self-play records. argmax normalizes the two. 
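+        # Added hedged illustration (not upstream Minigo code): both label
+        # encodings produce the same argmax target, e.g. a one-hot label
+        # [0., 1., 0.] and a soft-max label [0.2, 0.7, 0.1] both select
+        # index 1, so the top-k accuracy metrics below treat them alike.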
+ policy_target_top_1 = tf.argmax(pi_tensor, axis=1) + + policy_output_in_top1 = tf.to_float( + tf.nn.in_top_k(policy_output, policy_target_top_1, k=1)) + policy_output_in_top3 = tf.to_float( + tf.nn.in_top_k(policy_output, policy_target_top_1, k=3)) + + policy_top_1_confidence = tf.reduce_max(policy_output, axis=1) + policy_target_top_1_confidence = tf.boolean_mask( + policy_output, + tf.one_hot(policy_target_top_1, tf.shape(policy_output)[1])) + + value_cost_normalized = value_cost / params['value_cost_weight'] + avg_value_observed = tf.reduce_mean(value_tensor) + + with tf.variable_scope('metrics'): + metric_ops = { + 'policy_cost': tf.metrics.mean(policy_cost), + 'value_cost': tf.metrics.mean(value_cost), + 'value_cost_normalized': tf.metrics.mean(value_cost_normalized), + 'l2_cost': tf.metrics.mean(l2_cost), + 'policy_entropy': tf.metrics.mean(policy_entropy), + 'combined_cost': tf.metrics.mean(combined_cost), + 'avg_value_observed': tf.metrics.mean(avg_value_observed), + 'policy_accuracy_top_1': tf.metrics.mean(policy_output_in_top1), + 'policy_accuracy_top_3': tf.metrics.mean(policy_output_in_top3), + 'policy_top_1_confidence': tf.metrics.mean(policy_top_1_confidence), + 'policy_target_top_1_confidence': tf.metrics.mean( + policy_target_top_1_confidence), + 'value_confidence': tf.metrics.mean(tf.abs(value_output)), + } + + if est_mode == tf.estimator.ModeKeys.EVAL: + return metric_ops + + # NOTE: global_step is rounded to a multiple of FLAGS.summary_steps. + eval_step = tf.reduce_min(step) + + # Create summary ops so that they show up in SUMMARIES collection + # That way, they get logged automatically during training + + ######################################## host_call_fn: start ##################################### + #summary_writer = contrib_summary.create_file_writer(FLAGS.work_dir) + #with summary_writer.as_default(), contrib_summary.record_summaries_every_n_global_steps(params['summary_steps'], eval_step): + #for metric_name, metric_op in metric_ops.items(): + #contrib_summary.scalar(metric_name, metric_op[1], step=eval_step) + def host_call_fn(work_dir, metric_ops, eval_step): + with contrib_summary.create_file_writer(work_dir, max_queue=params['iterations_per_loop']).as_default(): + with contrib_summary.record_summaries_every_n_global_steps(params['summary_steps'], eval_step): + for metric_name, metric_op in metric_ops.items(): + contrib_summary.scalar(metric_name, metric_op[1], step=eval_step) + return contrib_summary.all_summary_ops() + ######################################## host_call_fn: end ##################################### + + # Reset metrics occasionally so that they are mean of recent batches. 
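+        # Added hedged note: the tf.metrics.* ops above keep their running
+        # total/count accumulators in local variables under the 'metrics'
+        # scope, so re-running their initializer (reset_op below) zeroes
+        # them; each logged value then averages only the batches seen since
+        # the previous reset.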
+ reset_op = tf.variables_initializer(tf.local_variables('metrics')) + cond_reset_op = tf.cond( + tf.equal(eval_step % params['summary_steps'], tf.to_int64(1)), + lambda: reset_op, + lambda: tf.no_op()) + + ######################################## host_call_fn: start ##################################### + #return contrib_summary.all_summary_ops() + [cond_reset_op] + return host_call_fn(FLAGS.work_dir, metric_ops, eval_step) + [cond_reset_op] + ######################################## host_call_fn: end ##################################### + + metric_args = [ + policy_output, + value_output, + labels['pi_tensor'], + labels['value_tensor'], + tf.reshape(policy_cost, [1]), + tf.reshape(value_cost, [1]), + tf.reshape(l2_cost, [1]), + tf.reshape(combined_cost, [1]), + tf.reshape(global_step, [1]), + ] + + predictions = { + 'policy_output': policy_output, + 'value_output': value_output, + } + + eval_metrics_only_fn = functools.partial( + eval_metrics_host_call_fn, est_mode=tf.estimator.ModeKeys.EVAL) + host_call_fn = functools.partial( + eval_metrics_host_call_fn, est_mode=tf.estimator.ModeKeys.TRAIN) + + ######################################## host_call_fn: start ##################################### + #tpu_estimator_spec = contrib_tpu_python_tpu_tpu_estimator.TPUEstimatorSpec( + tpu_estimator_spec = NPUEstimatorSpec( + mode=mode, + predictions=predictions, + loss=combined_cost, + train_op=train_op, + #eval_metrics=(eval_metrics_only_fn, metric_args), + host_call=(host_call_fn, metric_args) + ) + return tpu_estimator_spec + #if params['use_tpu']: + # return tpu_estimator_spec + #else: + # return tpu_estimator_spec.as_estimator_spec() + ######################################## host_call_fn: end ##################################### + + +def model_inference_fn(features, training, params): + """Builds just the inference part of the model graph. + + Args: + features: input features tensor. + training: True if the model is training. + params: A dictionary + + Returns: + (policy_output, value_output, logits) tuple of tensors. + """ + + if FLAGS.bool_features: + features = tf.dtypes.cast(features, dtype=tf.float32) + + if FLAGS.input_layout == 'nhwc': + bn_axis = -1 + data_format = 'channels_last' + else: + bn_axis = 1 + data_format = 'channels_first' + + mg_batchn = functools.partial( + tf.layers.batch_normalization, + axis=bn_axis, + momentum=.95, + epsilon=1e-5, + center=True, + scale=True, + fused=True, + training=training) + + mg_conv2d = functools.partial( + tf.layers.conv2d, + filters=params['conv_width'], + kernel_size=3, + padding='same', + use_bias=False, + data_format=data_format) + + mg_global_avgpool2d = functools.partial( + tf.layers.average_pooling2d, + pool_size=go.N, + strides=1, + padding='valid', + data_format=data_format) + + def mg_activation(inputs): + if FLAGS.use_swish: + return tf.nn.swish(inputs) + + return tf.nn.relu(inputs) + + def residual_inner(inputs): + conv_layer1 = mg_batchn(mg_conv2d(inputs)) + initial_output = mg_activation(conv_layer1) + conv_layer2 = mg_batchn(mg_conv2d(initial_output)) + return conv_layer2 + + def mg_res_layer(inputs): + residual = residual_inner(inputs) + output = mg_activation(inputs + residual) + return output + + def mg_squeeze_excitation_layer(inputs): + # Hu, J., Shen, L., & Sun, G. (2018). Squeeze-and-Excitation Networks. + # 2018 IEEE/CVF Conference on Computer Vision, 7132-7141. 
+ # arXiv:1709.01507 [cs.CV] + + channels = params['conv_width'] + ratio = FLAGS.SE_ratio + assert channels % ratio == 0 + + residual = residual_inner(inputs) + pool = mg_global_avgpool2d(residual) + fc1 = tf.layers.dense(pool, units=channels // ratio) + squeeze = mg_activation(fc1) + + if FLAGS.use_SE_bias: + fc2 = tf.layers.dense(squeeze, units=2*channels) + # Channels_last so axis = 3 = -1 + gamma, bias = tf.split(fc2, 2, axis=3) + else: + gamma = tf.layers.dense(squeeze, units=channels) + bias = 0 + + sig = tf.nn.sigmoid(gamma) + # Explicitly signal the broadcast. + scale = tf.reshape(sig, [-1, 1, 1, channels]) + + excitation = tf.multiply(scale, residual) + bias + return mg_activation(inputs + excitation) + + initial_block = mg_activation(mg_batchn(mg_conv2d(features))) + + # the shared stack + shared_output = initial_block + for _ in range(params['trunk_layers']): + if FLAGS.use_SE or FLAGS.use_SE_bias: + shared_output = mg_squeeze_excitation_layer(shared_output) + else: + shared_output = mg_res_layer(shared_output) + + # Policy head + policy_conv = mg_conv2d( + shared_output, filters=params['policy_conv_width'], kernel_size=1) + policy_conv = mg_activation( + mg_batchn(policy_conv, center=False, scale=False)) + logits = tf.layers.dense( + tf.reshape( + policy_conv, [-1, params['policy_conv_width'] * go.N * go.N]), + go.N * go.N + 1) + + policy_output = tf.nn.softmax(logits, name='policy_output') + + # Value head + value_conv = mg_conv2d( + shared_output, filters=params['value_conv_width'], kernel_size=1) + value_conv = mg_activation( + mg_batchn(value_conv, center=False, scale=False)) + + value_fc_hidden = mg_activation(tf.layers.dense( + tf.reshape(value_conv, [-1, params['value_conv_width'] * go.N * go.N]), + params['fc_width'])) + value_output = tf.nn.tanh( + tf.reshape(tf.layers.dense(value_fc_hidden, 1), [-1]), + name='value_output') + + return policy_output, value_output, logits + + +def tpu_model_inference_fn(features): + """Builds the model graph suitable for running on TPU. + + It does two things: + 1) Mark all weights as constant, which improves TPU inference performance + because it prevents the weights being transferred to the TPU every call + to Session.run(). + 2) Adds constant to the graph with a unique value and marks it as a + dependency on the rest of the model. This works around a TensorFlow bug + that prevents multiple models being run on a single TPU. + + Returns: + (policy_output, value_output, logits) tuple of tensors. + """ + def custom_getter(getter, name, *args, **kwargs): + with tf.control_dependencies(None): + return tf.guarantee_const( + getter(name, *args, **kwargs), name=name + '/GuaranteeConst') + with tf.variable_scope('', custom_getter=custom_getter): + # TODO(tommadams): remove the tf.control_dependencies context manager + # when a fixed version of TensorFlow is released. 
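+        # Added hedged note: the wall-clock value below makes the constant's
+        # name unique per freeze (e.g. 'epoch_time_1600000000'), so no two
+        # frozen graphs are byte-identical, which is the workaround for the
+        # TensorFlow bug described in the docstring above.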
+ t = int(time.time()) + epoch_time = tf.constant(t, name='epoch_time_%d' % t) + with tf.control_dependencies([epoch_time]): + if FLAGS.input_layout == 'nhwc': + feature_shape = [-1, go.N, go.N, get_features_planes()] + else: + feature_shape = [-1, get_features_planes(), go.N, go.N] + features = tf.reshape(features, feature_shape) + return model_inference_fn(features, False, FLAGS.flag_values_dict()) + + +def maybe_set_seed(): + if FLAGS.training_seed != 0: + random.seed(FLAGS.training_seed) + tf.set_random_seed(FLAGS.training_seed) + np.random.seed(FLAGS.training_seed) + + +def get_estimator(): + if FLAGS.use_tpu: + return _get_tpu_estimator() + else: + return _get_nontpu_estimator() + + +def _get_nontpu_estimator(): + session_config = tf.ConfigProto(allow_soft_placement=True) + custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = 'NpuOptimizer' + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") + #session_config = npu_config_proto(config_proto=tf.ConfigProto()) + session_config.gpu_options.allow_growth = True + + run_config = tf.estimator.RunConfig( + save_summary_steps=FLAGS.summary_steps, + keep_checkpoint_max=FLAGS.keep_checkpoint_max, + session_config=session_config) + return tf.estimator.Estimator( + model_fn, + model_dir=FLAGS.work_dir, + config=npu_run_config_init(run_config=run_config), + params=FLAGS.flag_values_dict()) + + +def _get_tpu_estimator(): + tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=None, project=None) + tpu_grpc_url = tpu_cluster_resolver.get_master() + + run_config = contrib_tpu_python_tpu_tpu_config.RunConfig( + master=tpu_grpc_url, + evaluation_master=tpu_grpc_url, + model_dir=FLAGS.work_dir, + save_checkpoints_steps=max(1000, FLAGS.iterations_per_loop), + save_summary_steps=FLAGS.summary_steps, + keep_checkpoint_max=FLAGS.keep_checkpoint_max, + session_config=npu_config_proto( + config_proto=tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=True,) + ), + tpu_config=contrib_tpu_python_tpu_tpu_config.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=contrib_tpu_python_tpu_tpu_config.InputPipelineConfig.PER_HOST_V2)) + + ######################################## host_call_fn: start ##################################### + #return contrib_tpu_python_tpu_tpu_estimator.TPUEstimator( + return NPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=npu_run_config_init(run_config=run_config), + train_batch_size=FLAGS.train_batch_size * FLAGS.num_tpu_cores, + eval_batch_size=FLAGS.train_batch_size * FLAGS.num_tpu_cores, + params=FLAGS.flag_values_dict()) + ######################################## host_call_fn: end ##################################### + +def bootstrap(): + """Initialize a tf.Estimator run with random initial weights.""" + # a bit hacky - forge an initial checkpoint with the name that subsequent + # Estimator runs will expect to find. + # + # Estimator will do this automatically when you call train(), but calling + # train() requires data, and I didn't feel like creating training data in + # order to run the full train pipeline for 1 step. 
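+    # Added hedged note: 'model.ckpt-1' mimics the checkpoint name Estimator
+    # itself would write after one training step, so later runs pick up the
+    # forged file as the latest checkpoint without any real training.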
+ maybe_set_seed() + initial_checkpoint_name = 'model.ckpt-1' + save_file = os.path.join(FLAGS.work_dir, initial_checkpoint_name) + sess = tf.Session(config=npu_config_proto(), graph=tf.Graph()) + with sess.graph.as_default(): + features, labels = get_inference_input() + model_fn(features, labels, tf.estimator.ModeKeys.PREDICT, + params=FLAGS.flag_values_dict()) + sess.run(tf.global_variables_initializer()) + tf.train.Saver().save(sess, save_file) + + +def export_model(model_path): + """Take the latest checkpoint and copy it to model_path. + + Assumes that all relevant model files are prefixed by the same name. + (For example, foo.index, foo.meta and foo.data-00000-of-00001). + + Args: + model_path: The path (can be a gs:// path) to export model + """ + estimator = tf.estimator.Estimator(model_fn, model_dir=FLAGS.work_dir, + params=FLAGS.flag_values_dict(), config=npu_run_config_init()) + latest_checkpoint = estimator.latest_checkpoint() + all_checkpoint_files = tf.gfile.Glob(latest_checkpoint + '*') + for filename in all_checkpoint_files: + suffix = filename.partition(latest_checkpoint)[2] + destination_path = model_path + suffix + print('Copying {} to {}'.format(filename, destination_path)) + tf.gfile.Copy(filename, destination_path) + + +def freeze_graph(model_path, use_trt=False, trt_max_batch_size=8, + trt_precision='fp32'): + output_names = ['policy_output', 'value_output'] + + n = DualNetwork(model_path) + out_graph = tf.graph_util.convert_variables_to_constants( + n.sess, n.sess.graph.as_graph_def(), output_names) + + if use_trt: + import tensorflow.contrib.tensorrt as trt + out_graph = trt.create_inference_graph( + input_graph_def=out_graph, + outputs=output_names, + max_batch_size=trt_max_batch_size, + max_workspace_size_bytes=1 << 29, + precision_mode=trt_precision) + + metadata = make_model_metadata({ + 'engine': 'tf', + 'use_trt': bool(use_trt), + }) + + minigo_model.write_graph_def(out_graph, metadata, model_path + '.minigo') + + +def freeze_graph_tpu(model_path): + """Custom freeze_graph implementation for Cloud TPU.""" + + assert model_path + assert FLAGS.tpu_name + if FLAGS.tpu_name.startswith('grpc://'): + tpu_grpc_url = FLAGS.tpu_name + else: + tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=None, project=None) + tpu_grpc_url = tpu_cluster_resolver.get_master() + sess = tf.Session(tpu_grpc_url, config=npu_config_proto()) + + output_names = [] + with sess.graph.as_default(): + # Replicate the inference function for each TPU core. + replicated_features = [] + feature_type = tf.bool if FLAGS.bool_features else tf.float32 + for i in range(FLAGS.num_tpu_cores): + name = 'pos_tensor_%d' % i + features = tf.placeholder( + feature_type, [None], name=name) + replicated_features.append((features,)) + outputs = contrib_tpu.replicate( + tpu_model_inference_fn, replicated_features) + + # The replicate op assigns names like output_0_shard_0 to the output + # names. Give them human readable names. 
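+        # Added hedged illustration: e.g. with num_tpu_cores=8 the identity
+        # ops below export 'policy_output_0'/'value_output_0' through
+        # 'policy_output_7'/'value_output_7', one pair per replica.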
+    for i, (policy_output, value_output, _) in enumerate(outputs):
+        policy_name = 'policy_output_%d' % i
+        value_name = 'value_output_%d' % i
+        output_names.extend([policy_name, value_name])
+        tf.identity(policy_output, policy_name)
+        tf.identity(value_output, value_name)
+
+    tf.train.Saver().restore(sess, model_path)
+
+    out_graph = tf.graph_util.convert_variables_to_constants(
+        sess, sess.graph.as_graph_def(), output_names)
+
+    metadata = make_model_metadata({
+        'engine': 'tpu',
+        'num_replicas': FLAGS.num_tpu_cores,
+    })
+
+    minigo_model.write_graph_def(out_graph, metadata, model_path + '.minigo')
+
+
+def make_model_metadata(metadata):
+    for f in ['conv_width', 'fc_width', 'trunk_layers', 'use_SE', 'use_SE_bias',
+              'use_swish', 'input_features', 'input_layout']:
+        metadata[f] = getattr(FLAGS, f)
+    metadata['input_type'] = 'bool' if FLAGS.bool_features else 'float'
+    metadata['board_size'] = go.N
+    return metadata
diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_1p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..61dffddbd77fdbd0e1ea2f5d8e98ed5467928867
--- /dev/null
+++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_1p.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+
+cur_path=`pwd`/../
+rm -f $cur_path/outputs/models/*
+rm -f $cur_path/estimator_working_dir/*
+
+export ENABLE_RUNTIME_V2=1
+#基础参数,需要模型审视修改
+#Batch Size
+batch_size=128
+#网络名称,同目录名称
+Network="MiniGo_ID0629_for_TensorFlow"
+#Device数量,单卡默认为1
+RankSize=1
+#训练epoch,可选
+train_epochs=
+#训练step
+train_steps=500
+#学习率
+learning_rate=
+#动态输入模式,不需要修改
+dynamic_input=""
+
+#参数配置 npu param
+precision_mode="allow_fp32_to_fp16"
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+data_path="./outputs/data/selfplay"
+
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage: ./train_RT2_performance_1p.sh $data_path --work_dir="$cur_path/estimator_working_dir" --export_path="$cur_path/outputs/models/000001-first_generation""
+    exit 1
+fi
+
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --dynamic_input* ]];then
+        dynamic_input=`echo ${para#*=}`
+    fi
+done
+
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+##############执行训练##########
+cd $cur_path
+
+if [ -d $cur_path/test/output ];then
+    rm -rf $cur_path/test/output/*
+    mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID
+fi
+wait
+
+#(Step1)初始化 一定要先运行这一步
+python3 bootstrap.py --work_dir=$cur_path/estimator_working_dir --export_path=$cur_path/outputs/models/000000-bootstrap
+wait
+
+start=$(date +%s)
+#(Step3)训练
+#python3 train.py --training_data_path=$data_path --steps_to_train=$train_steps --train_batch_size=$batch_size --work_dir=$cur_path/estimator_working_dir --export_path=$cur_path/outputs/models/000001-first_generation > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+python3 train_rt.py \
+    --training_data_path=$data_path \
+    --steps_to_train=$train_steps \
+    --train_batch_size=$batch_size \
+    --work_dir=$cur_path/estimator_working_dir \
+    --export_path=$cur_path/outputs/models/000001-first_generation \
+    --dynamic_input=${dynamic_input} > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+wait
+end=$(date +%s)
+e2etime=$(( $end - $start ))
+
+#echo "Final Performance ms/step : $average_perf"
+echo "Final Training Duration sec : $e2etime"
+
+
+###下面字段用于冒烟看护
+BatchSize=${batch_size}
+#设备类型,自动获取
+DeviceType=`uname -m`
+#用例名称,自动获取
+CaseName=${Network}_bs${BatchSize}_${RankSize}'p_RT2_perf'
+
+#获取性能
+TrainingTime=`grep "tensorflow:global_step/sec" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'`
+wait
+ActualFPS=`awk 'BEGIN{printf "%.2f\n", '${BatchSize}'*'${TrainingTime}'}'`
+
+#从train_*.log中提取Loss到${CaseName}_loss.txt中
+grep "] loss" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $7}' |cut -d , -f 1 >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt
+ActualLoss=`awk 'END {print $1}' $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt`
+
+#关键信息打印到CaseName.log中,此处无需修改
+echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RankSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${batch_size}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2etime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DynamicInput = ${dynamic_input}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fdefa5091c42dbb6e74de5b1dae751ad40883299
--- /dev/null
+++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+
+cur_path=`pwd`/../
+rm -f $cur_path/outputs/models/*
+rm -f $cur_path/estimator_working_dir/*
+
+export ENABLE_RUNTIME_V2=1
+#基础参数,需要模型审视修改
+#Batch Size
+batch_size=128
+#网络名称,同目录名称
+Network="MiniGo_ID0629_for_TensorFlow"
+#Device数量,单卡默认为1
+RankSize=8
+#训练epoch,可选
+train_epochs=
+#训练step
+train_steps=500
+#学习率
+learning_rate=
+#动态输入模式,不需要修改
+dynamic_input=""
+
+#参数配置 npu param
+precision_mode="allow_fp32_to_fp16"
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+data_path="./outputs/data/selfplay"
+
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage: ./train_RT2_performance_8p.sh $data_path --work_dir="$cur_path/estimator_working_dir" --export_path="$cur_path/outputs/models/000001-first_generation""
+    exit 1
+fi
+
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --bind_core* ]]; then
+        bind_core=`echo ${para#*=}`
+        name_bind="_bindcore"
+    elif [[ $para == --dynamic_input* ]];then
+        dynamic_input=`echo ${para#*=}`
+    fi
+done
+
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+##############执行训练########## +cd $cur_path + +#(Step1)初始化 一定要先运行这一步 +python3 bootstrap.py --work_dir=$cur_path/estimator_working_dir --export_path=$cur_path/outputs/models/000000-bootstrap +wait + +export ASCEND_DEVICE_ID=0 +export RANK_SIZE=8 +export RANK_TABLE_FILE="${cur_path}/test/8p.json" +export JOB_ID=10086 + +start=$(date +%s) + +# 8P训练模式 +for i in 0 1 2 3 4 5 6 7 +do + #设置环境变量 + export RANK_ID=$i + export ASCEND_DEVICE_ID=$i + ASCEND_DEVICE_ID=$i + echo "Device ID: $ASCEND_DEVICE_ID" + + if [ -d $cur_path/test/output/$ASCEND_DEVICE_ID ];then + rm -rf $cur_path/test/output/$ASCEND_DEVICE_ID + mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID + else + mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID + fi + echo $ASCEND_DEVICE_ID + #(Step3)训练 + corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 + let c=b*${corenum}/8-1 + if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" + fi + #${bind_core} python3 train.py --training_data_path=$data_path --steps_to_train=$train_steps --train_batch_size=$batch_size --work_dir=$cur_path/estimator_working_dir --export_path=$cur_path/outputs/models/000001-first_generation > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & + ${bind_core} python3 train_rt.py \ + --training_data_path=$data_path \ + --steps_to_train=$train_steps \ + --train_batch_size=$batch_size \ + --work_dir=$cur_path/estimator_working_dir \ + --export_path=$cur_path/outputs/models/000001-first_generation \ + --dynamic_input=${dynamic_input} > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +done +wait + +end=$(date +%s) +e2etime=$(( $end - $start )) + +#echo "Final Performance ms/step : $average_perf" +echo "Final Training Duration sec : $e2etime" + + +###下面字段用于冒烟看护 +BatchSize=${batch_size} +#设备类型,自动获取 +DeviceType=`uname -m` +#用例名称,自动获取 +CaseName=${Network}_bs${BatchSize}_${RankSize}'p_RT2_perf' + +#获取性能 +TrainingTime=`grep "tensorflow:global_step/sec" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +wait +ActualFPS=`awk 'BEGIN{printf "%.2f\n", '${BatchSize}'*'${RankSize}'*'${TrainingTime}'}'` + +#从train_*.log中提取Loss到${CaseName}_loss.txt中 +grep "] loss" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $7}' |cut -d , -f 1 >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt +ActualLoss=`awk 'END {print $1}' $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt` + +#关键信息打印到CaseName.log中,此处无需修改 +echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RankSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2etime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DynamicInput = ${dynamic_input}" >> 
$cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/train_rt.py b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/train_rt.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cafcdafe0e8c667da91d61f73aa26c495c764f6
--- /dev/null
+++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/train_rt.py
@@ -0,0 +1,315 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Copyright 2018 Google LLC
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Train a network.
+
+Usage:
+    BOARD_SIZE=19 python train_rt.py tfrecord1 tfrecord2 tfrecord3
+"""
+from npu_bridge.npu_init import *
+
+#########################Solve Argument list too long : start #########################
+import os
+#########################Solve Argument list too long : end #########################
+
+import logging
+import math
+
+from absl import app, flags
+import numpy as np
+import tensorflow as tf
+
+import bigtable_input
+import dual_net_rt
+import preprocessing
+import utils
+
+# See www.moderndescartes.com/essays/shuffle_viz for discussion on sizing
+flags.DEFINE_integer('shuffle_buffer_size', 2000,
+                     'Size of buffer used to shuffle train examples.')
+
+flags.DEFINE_boolean('shuffle_examples', True,
+                     'Whether to shuffle training examples.')
+
+flags.DEFINE_integer('steps_to_train', None,
+                     'Number of training steps to take. If not set, iterates '
+                     'once over training data.')
+
+flags.DEFINE_integer('num_examples', None,
+                     'Total number of input examples. This is only used if '
+                     'steps_to_train is not set. 
Requires that filter_amount ' + 'is 1.0.') + +flags.DEFINE_integer('window_size', 500000, + 'Number of games to include in the window') + +flags.DEFINE_float('filter_amount', 1.0, + 'Fraction of positions to filter from golden chunks,' + 'default, 1.0 (no filter)') + +flags.DEFINE_string('export_path', None, + 'Where to export the model after training.') + +################## Solve Argument List Too long: start ################## +flags.DEFINE_string('training_data_path', None, + 'training data path.') +################## Solve Argument List Too long: end ################## + +flags.DEFINE_bool('use_bt', False, + 'Whether to use Bigtable as input. ' + '(Only supported with --use_tpu, currently.)') + +flags.DEFINE_bool('freeze', False, + 'Whether to freeze the graph at the end of training.') + +flags.DEFINE_boolean( + 'use_trt', False, 'True to write a GraphDef that uses the TRT runtime') +flags.DEFINE_integer('trt_max_batch_size', None, + 'Maximum TRT batch size') +flags.DEFINE_string('trt_precision', 'fp32', + 'Precision for TRT runtime: fp16, fp32 or int8') +flags.register_multi_flags_validator( + ['use_trt', 'trt_max_batch_size'], + lambda flags: not flags['use_trt'] or flags['trt_max_batch_size'], + 'trt_max_batch_size must be set if use_trt is true') + + +flags.register_multi_flags_validator( + ['use_bt', 'use_tpu'], + lambda flags: flags['use_tpu'] if flags['use_bt'] else True, + '`use_bt` flag only valid with `use_tpu` as well') + +@flags.multi_flags_validator( + ['num_examples', 'steps_to_train', 'filter_amount'], + '`num_examples` requires `steps_to_train==0` and `filter_amount==1.0`') +def _example_flags_validator(flags_dict): + if not flags_dict['num_examples']: + return True + return not flags_dict['steps_to_train'] and flags_dict['filter_amount'] == 1.0 + +@flags.multi_flags_validator( + ['use_bt', 'cbt_project', 'cbt_instance', 'cbt_table'], + message='Cloud Bigtable configuration flags not correct') +def _bt_checker(flags_dict): + if not flags_dict['use_bt']: + return True + return (flags_dict['cbt_project'] + and flags_dict['cbt_instance'] + and flags_dict['cbt_table']) + + +# From dual_net.py +flags.declare_key_flag('work_dir') +flags.declare_key_flag('train_batch_size') +flags.declare_key_flag('num_tpu_cores') +flags.declare_key_flag('use_tpu') +flags.declare_key_flag('dynamic_input') + +FLAGS = flags.FLAGS + + +class EchoStepCounterHook(tf.train.StepCounterHook): + """A hook that logs steps per second.""" + + def _log_and_record(self, elapsed_steps, elapsed_time, global_step): + s_per_sec = elapsed_steps / elapsed_time + logging.info("{}: {:.3f} steps per second".format(global_step, s_per_sec)) + super()._log_and_record(elapsed_steps, elapsed_time, global_step) + + +def compute_update_ratio(weight_tensors, before_weights, after_weights): + """Compute the ratio of gradient norm to weight norm.""" + deltas = [after - before for after, + before in zip(after_weights, before_weights)] + delta_norms = [np.linalg.norm(d.ravel()) for d in deltas] + weight_norms = [np.linalg.norm(w.ravel()) for w in before_weights] + ratios = [d / w for d, w in zip(delta_norms, weight_norms)] + all_summaries = [ + tf.Summary.Value(tag='update_ratios/' + + tensor.name, simple_value=ratio) + for tensor, ratio in zip(weight_tensors, ratios)] + return tf.Summary(value=all_summaries) + + +class UpdateRatioSessionHook(tf.train.SessionRunHook): + """A hook that computes ||grad|| / ||weights|| (using frobenius norm).""" + + def __init__(self, output_dir, every_n_steps=1000): + self.output_dir = 
output_dir + self.every_n_steps = every_n_steps + self.before_weights = None + self.file_writer = None + self.weight_tensors = None + self.global_step = None + + def begin(self): + """Called once before using the session""" + # These calls only works because the SessionRunHook api guarantees this + # will get called within a graph context containing our model graph. + + self.file_writer = tf.summary.FileWriterCache.get(self.output_dir) + self.weight_tensors = tf.trainable_variables() + self.global_step = tf.train.get_or_create_global_step() + + def before_run(self, run_context): + """Called before each call to run().""" + global_step = run_context.session.run(self.global_step) + if global_step % self.every_n_steps == 0: + self.before_weights = run_context.session.run(self.weight_tensors) + + def after_run(self, run_context, unused_run_values): + """Called after each call to run().""" + global_step = run_context.session.run(self.global_step) + if self.before_weights is not None: + after_weights = run_context.session.run(self.weight_tensors) + weight_update_summaries = compute_update_ratio(self.weight_tensors, self.before_weights, after_weights) + self.file_writer.add_summary(weight_update_summaries, global_step) + self.before_weights = None + + +def train(*tf_records: "Records to train on"): + """Train on examples.""" + tf.logging.set_verbosity(tf.logging.INFO) + estimator = dual_net_rt.get_estimator() + + effective_batch_size = FLAGS.train_batch_size + if FLAGS.use_tpu: + effective_batch_size *= FLAGS.num_tpu_cores + + if FLAGS.use_tpu: + if FLAGS.use_bt: + def _input_fn(params): + games = bigtable_input.GameQueue( + FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) + games_nr = bigtable_input.GameQueue( + FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table + '-nr') + return preprocessing.get_tpu_bt_input_tensors( + games, + games_nr, + params['batch_size'], + params['input_layout'], + number_of_games=FLAGS.window_size, + random_rotation=True) + else: + def _input_fn(params): + return preprocessing.get_tpu_input_tensors( + params['batch_size'], + params['input_layout'], + tf_records, + filter_amount=FLAGS.filter_amount, + shuffle_examples=FLAGS.shuffle_examples, + shuffle_buffer_size=FLAGS.shuffle_buffer_size, + random_rotation=True) + # Hooks are broken with TPUestimator at the moment. 
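+        # Added hedged note: on the non-TPU path below, UpdateRatioSessionHook
+        # and EchoStepCounterHook are attached instead; here the list is empty.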
+ hooks = [] + else: + def _input_fn(): + return preprocessing.get_input_tensors( + FLAGS.train_batch_size, + FLAGS.input_layout, + tf_records, + filter_amount=FLAGS.filter_amount, + shuffle_examples=FLAGS.shuffle_examples, + shuffle_buffer_size=FLAGS.shuffle_buffer_size, + random_rotation=True) + + hooks = [UpdateRatioSessionHook(FLAGS.work_dir), + EchoStepCounterHook(output_dir=FLAGS.work_dir)] + + steps = FLAGS.steps_to_train + if not steps and FLAGS.num_examples: + batch_size = FLAGS.train_batch_size + if FLAGS.use_tpu: + batch_size *= FLAGS.num_tpu_cores + steps = math.floor(FLAGS.num_examples / batch_size) + + logging.info("Training, steps = %s, batch = %s -> %s examples", + steps or '?', effective_batch_size, + (steps * effective_batch_size) if steps else '?') + + if FLAGS.use_bt: + games = bigtable_input.GameQueue( + FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) + if not games.read_wait_cell(): + games.require_fresh_games(20000) + latest_game = games.latest_game_number + index_from = max(latest_game, games.read_wait_cell()) + print("== Last game before training:", latest_game, flush=True) + print("== Wait cell:", games.read_wait_cell(), flush=True) + + try: + estimator.train(_input_fn, steps=steps, hooks=npu_hooks_append(hooks_list=hooks)) + if FLAGS.use_bt: + bigtable_input.set_fresh_watermark(games, index_from, + FLAGS.window_size) + except: + if FLAGS.use_bt: + games.require_fresh_games(0) + raise + + +def main(argv): + """Train on examples and export the updated model weights.""" + ################## Solve Argument List Too long: start ################## + # tf_records = argv[1:] + tf_records = [] + for presentdir, dirnames, filenames in os.walk(FLAGS.training_data_path): + for filename in filenames: + # files with path + file_with_path = os.path.join(presentdir, filename) + tf_records.append(file_with_path) + ################## Solve Argument List Too long: end ################## + + logging.info("Training on %s records: %s to %s", + len(tf_records), tf_records[0], tf_records[-1]) + with utils.logged_timer("Training"): + train(*tf_records) + if FLAGS.export_path: + dual_net_rt.export_model(FLAGS.export_path) + if FLAGS.freeze: + if FLAGS.use_tpu: + dual_net_rt.freeze_graph_tpu(FLAGS.export_path) + else: + dual_net_rt.freeze_graph(FLAGS.export_path, FLAGS.use_trt, + FLAGS.trt_max_batch_size, FLAGS.trt_precision) + + +if __name__ == "__main__": + app.run(main) diff --git a/TensorFlow/built-in/cv/image_segmentation/2Dattentionunet_ID0120_for_TensorFlow/test/train_RT2_performance_1p.sh b/TensorFlow/built-in/cv/image_segmentation/2Dattentionunet_ID0120_for_TensorFlow/test/train_RT2_performance_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..2b733e78a08c3eaea7a3cf120fb9fa0202bc26a7 --- /dev/null +++ b/TensorFlow/built-in/cv/image_segmentation/2Dattentionunet_ID0120_for_TensorFlow/test/train_RT2_performance_1p.sh @@ -0,0 +1,197 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 + +export RANK_SIZE=1 +export JOB_ID=10087 +export ENABLE_RUNTIME_V2=1 +RANK_ID_START=0 + + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#设置默认日志级别,不需要修改 +export ASCEND_GLOBAL_LOG_LEVEL=3 + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="2Dattentionunet_ID0120_for_TensorFlow" +#训练参数 +model="aunet" +mode="train" +batch_size=32 + +#TF2.X独有,不需要修改 +#export NPU_LOOP_SIZE=${train_steps} + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_fp32_to_fp16" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False +autotune=False + +# 帮助信息,不需要修改 +if [[ 
$1 == --help || $1 == -h ]];then
    echo "usage: ./train_RT2_performance_1p.sh "
    echo " "
    echo "parameter explain:
    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
    --over_dump              if or not over detection, default is False
    --data_dump_flag         data dump flag, default is False
    --data_dump_step         data dump step, default is 10
    --profiling              if or not profiling for performance debug, default is False
    --autotune               whether to enable autotune, default is False
    --data_path              source data of training
    -h/--help                show help message
    "
    exit 1
fi

#参数校验,不需要修改
for para in $*
do
    if [[ $para == --precision_mode* ]];then
        precision_mode=`echo ${para#*=}`
    elif [[ $para == --over_dump* ]];then
        over_dump=`echo ${para#*=}`
        over_dump_path=${cur_path}/output/overflow_dump
        mkdir -p ${over_dump_path}
    elif [[ $para == --data_dump_flag* ]];then
        data_dump_flag=`echo ${para#*=}`
        data_dump_path=${cur_path}/output/data_dump
        mkdir -p ${data_dump_path}
    elif [[ $para == --data_dump_step* ]];then
        data_dump_step=`echo ${para#*=}`
    elif [[ $para == --profiling* ]];then
        profiling=`echo ${para#*=}`
        profiling_dump_path=${cur_path}/output/profiling
        mkdir -p ${profiling_dump_path}
    elif [[ $para == --autotune* ]];then
        autotune=`echo ${para#*=}`
        autotune_dump_path=${cur_path}/output/autotune_dump
        mkdir -p ${autotune_dump_path}/GA
        mkdir -p ${autotune_dump_path}/rl
        cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/
        cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/rl/
        mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak
        mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak
    elif [[ $para == --data_path* ]];then
        data_path=`echo ${para#*=}`
    fi
done

#校验是否传入data_path,不需要修改
if [[ $data_path == "" ]];then
    echo "[Error] para \"data_path\" must be config"
    exit 1
fi

#训练开始时间,不需要修改
start_time=$(date +%s)

#进入训练脚本目录,需要模型审视修改
cd $cur_path/..
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + + + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + # let a=RANK_ID*12 + # let b=RANK_ID+1 + # let c=b*12-1 + + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune + nohup python3 mainNPU_v2.py \ + --model=$model \ + --mode=$mode \ + --act=true \ + --crop_height=112 \ + --crop_width=112 \ + --batch_size=${batch_size} \ + --num_epoch=2 \ + --data_dir=${data_path} \ + --precision_mode=${precision_mode} \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} \ + --data_dump_path=${data_dump_path} \ + --profiling=${profiling} \ + --profiling_dump_path=${profiling_dump_path} \ + --autotune=${autotune} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a "Final performance FPS" ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F" " '{print $4}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +TrainAccuracy=`grep -a "Final accuracy" ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F" " '{print $4}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${TrainAccuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p_RT2_perf' + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep -a "Current_Loss" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $11}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${TrainAccuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git 
a/TensorFlow/built-in/nlp/Textcnn_ID0123_For_Tensorflow/run_cnn_rt.py b/TensorFlow/built-in/nlp/Textcnn_ID0123_For_Tensorflow/run_cnn_rt.py new file mode 100644 index 0000000000000000000000000000000000000000..f92e36462001d6b7421efc0b59c94c356d9f4b6a --- /dev/null +++ b/TensorFlow/built-in/nlp/Textcnn_ID0123_For_Tensorflow/run_cnn_rt.py @@ -0,0 +1,300 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from npu_bridge.npu_init import * +#from npu_bridge import * +import os +import sys +import time +from datetime import timedelta +import pickle +import numpy as np +import tensorflow as tf +from sklearn import metrics +from cnn_model import TCNNConfig, TextCNN +from data.cnews_loader import read_vocab, read_category, batch_iter, process_file, build_vocab +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('--save_dir', dest='save_dir', default='checkpoints/textcnn') +parser.add_argument('--data_path', dest='data_path', default='./data/cnews', help='path of the dataset') +parser.add_argument('--precision_mode', dest='precision_mode', default='allow_fp32_to_fp16', help='precision mode') +parser.add_argument('--over_dump', dest='over_dump', default='False', help='if or not over detection') +parser.add_argument('--over_dump_path', dest='over_dump_path', default='./overdump', help='over dump path') +parser.add_argument('--data_dump_flag', dest='data_dump_flag', default='False', help='data dump flag') +parser.add_argument('--data_dump_step', dest='data_dump_step', default='10', help='data dump step') +parser.add_argument('--data_dump_path', dest='data_dump_path', default='./datadump', help='data dump path') +parser.add_argument('--profiling', dest='profiling', default='False', help='if or not profiling for performance debug') +parser.add_argument('--profiling_dump_path', dest='profiling_dump_path', default='./profiling', help='profiling path') +parser.add_argument('--autotune', dest='autotune', default='False', help='whether to enable autotune, default is False') +parser.add_argument('--npu_loss_scale', dest='npu_loss_scale', type=int, default=1) +parser.add_argument('--mode', dest='mode', default='train', choices=('train', 'test', 'train_and_eval')) +parser.add_argument('--batch_size', dest='batch_size', 
type=int, default=64)
+parser.add_argument('--learning_rate', dest='learning_rate', type=float, default=0.001)
+parser.add_argument('--num_epochs', dest='num_epochs', type=int, default=10)
+args = parser.parse_args()
+
+base_dir = args.data_path
+train_dir = os.path.join(base_dir, 'cnews.train.txt')
+test_dir = os.path.join(base_dir, 'cnews.test.txt')
+val_dir = os.path.join(base_dir, 'cnews.val.txt')
+vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')
+save_dir = args.save_dir
+save_path = os.path.join(save_dir, 'best_validation')
+
+def get_time_dif(start_time):
+    """Return the elapsed time since start_time."""
+    end_time = time.time()
+    time_dif = (end_time - start_time)
+    return timedelta(seconds=int(round(time_dif))), time_dif
+
+def feed_data(x_batch, y_batch, keep_prob):
+    feed_dict = {
+        model.input_x: x_batch,
+        model.input_y: y_batch,
+        model.keep_prob: keep_prob
+    }
+    return feed_dict
+
+
+def evaluate(sess, x, y):
+    """Evaluate accuracy and loss on the given data."""
+    total_loss = 0.0
+    total_acc = 0.0
+    data_len = len(x)
+    batch_train = batch_iter_(x, y, 256)
+    for x_batch, y_batch in batch_train:
+        batch_len = len(x_batch)
+        feed_dict = feed_data(x_batch, y_batch, 1.0)
+        (loss, acc) = sess.run([model.loss, model.acc], feed_dict=feed_dict)
+        total_loss += (loss * batch_len)
+        total_acc += (acc * batch_len)
+    return ((total_loss / data_len), (total_acc / data_len))
+
+class data_load(object):
+    def __init__(self, sess, x, y, is_train=True):
+
+        with tf.device('/cpu:0'):
+            self.x = x
+            self.y = y
+            self.x_ = tf.placeholder(self.x.dtype, self.x.shape)
+            self.y_ = tf.placeholder(self.y.dtype, self.y.shape)
+            self.sess = sess
+            dataset = tf.data.Dataset.from_tensor_slices((self.x_, self.y_))
+
+            if is_train:
+                dataset = dataset.shuffle(len(self.x))
+                dataset = dataset.repeat()
+                dataset = dataset.batch(len(self.x))
+            else:
+                dataset = dataset.batch(len(self.x))
+
+            dataset = dataset.prefetch(2)
+            self.iterator = dataset.make_initializable_iterator()
+            self.next = self.iterator.get_next()
+            self.sess.run(self.iterator.initializer, feed_dict={self.x_: self.x, self.y_: self.y})
+
+    def replay(self):
+        self.sess.run(self.iterator.initializer, feed_dict={self.x_: self.x, self.y_: self.y})
+
+
+def batch_iter_(x, y, batch_size=64):
+    data_len = len(x)
+
+    num_batch = int((data_len - 1) / batch_size) + 1
+    for i in range(num_batch):
+        start_id = i * batch_size
+        end_id = min((i + 1) * batch_size, data_len)
+        yield x[start_id:end_id], y[start_id:end_id]
+
+def train():
+    print('Configuring TensorBoard and Saver...')
+    tensorboard_dir = 'tensorboard/textcnn'
+    if (not os.path.exists(tensorboard_dir)):
+        os.makedirs(tensorboard_dir)
+    tf.summary.scalar('loss', model.loss)
+    tf.summary.scalar('accuracy', model.acc)
+    merged_summary = tf.summary.merge_all()
+    writer = tf.summary.FileWriter(tensorboard_dir)
+    saver = tf.train.Saver()
+    if (not os.path.exists(save_dir)):
+        os.makedirs(save_dir)
+    print('Loading training and validation data...')
+    start_time = time.time()
+    (x_train, y_train) = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
+    (x_val, y_val) = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
+    time_dif = get_time_dif(start_time)
+    print('Time usage:', time_dif)
+
+    ############################ modify for run on npu ###############################
+    from npu_bridge.estimator import npu_ops
+    from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
+    sess_config = tf.ConfigProto()
+    custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    custom_op.parameter_map["use_off_line"].b = True  # must be explicitly enabled to run training on the Ascend AI processor
+    sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF  # remapping must be explicitly disabled
+    #custom_op.parameter_map["dynamic_input"].b = True
+    custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
+    #custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes("lazy_recompile")
+    #custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes(args.precision_mode)
+    if args.data_dump_flag.strip() == "True":
+        custom_op.parameter_map["enable_dump"].b = True
+        custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes(args.data_dump_path)
+        custom_op.parameter_map["dump_step"].s = tf.compat.as_bytes(args.data_dump_step)
+        custom_op.parameter_map["dump_mode"].s = tf.compat.as_bytes("all")
+    if args.over_dump.strip() == "True":
+        # dump_path: where dump data is written; the directory must already exist
+        # on the training host (container or host side) and be writable by the
+        # configured run user
+        custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes(args.over_dump_path)
+        # enable_dump_debug: whether to enable overflow detection
+        custom_op.parameter_map["enable_dump_debug"].b = True
+        # dump_debug_mode: overflow detection mode, one of all/aicore_overflow/atomic_overflow
+        custom_op.parameter_map["dump_debug_mode"].s = tf.compat.as_bytes("all")
+    if args.profiling.strip() == "True":
+        # profiling_mode must be enabled for the profiling options below to take effect
+        custom_op.parameter_map["profiling_mode"].b = True
+        profilingvalue = (
+            '{"output":"%s","training_trace":"on","task_trace":"on","aicpu":"on","fp_point":"","bp_point":""}' % (
+                args.profiling_dump_path))
+        custom_op.parameter_map["profiling_options"].s = tf.compat.as_bytes(profilingvalue)
+    ############################ modify for run on npu ###############################
+    print("NPU session configuration finished")
+    session = tf.Session(config=sess_config)
+    session.run(tf.global_variables_initializer())
+    writer.add_graph(session.graph)
+    train_len = len(x_train)
+    val_len = len(x_val)
+    train_data = data_load(session, x_train, y_train)
+    val = data_load(session, x_val, y_val, False)
+    x_v, y_v = session.run(val.next)
+    tf.io.write_graph(session.graph_def, 'checkpoints', 'train.pbtxt')
+    print('Training and evaluating...')
+    start_time = time.time()
+    data_time = 0
+    total_batch = 0
+    best_acc_val = 0.0
+    last_improved = 0
+    require_improvement = 10000
+    total_feed = 0
+    total_summary = 0
+    total_val = 0
+    total_save = 0
+    total_train = 0
+    flag = False
+    for epoch in range(config.num_epochs):
+        print('Epoch:', (epoch + 1))
+        x, y = session.run(train_data.next)
+        batch_train = batch_iter_(x, y, config.batch_size)
+        for (x_batch, y_batch) in batch_train:
+            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)
+            #if total_batch % config.save_per_batch == 0:
+                # write training results to a tensorboard scalar every few rounds
+                #s = session.run(merged_summary, feed_dict=feed_dict)
+                #writer.add_summary(s, total_batch)
+            if ((total_batch % config.print_per_batch) == 0):
+                feed_dict[model.keep_prob] = 1.0
+                (loss_train, acc_train) = session.run([model.loss, model.acc], feed_dict=feed_dict)
+                (loss_val, acc_val) = evaluate(session, x_v, y_v)
+                if (acc_val > best_acc_val):
+                    best_acc_val = acc_val
+                    last_improved = total_batch
+                    saver.save(sess=session, save_path=save_path)
+                    improved_str = '*'
+                else:
+                    improved_str = ''
+                time_dif, time_sec = get_time_dif(start_time)
+                msg = ('Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},'
+                       ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6} ({7})')
+                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str, time_sec))
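+            # Added note: keep_prob was forced to 1.0 above for the eval pass;
+            # restore the configured dropout rate before the optimizer step below.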
+            feed_dict[model.keep_prob] = config.dropout_keep_prob
+            session.run(model.optim, feed_dict=feed_dict)
+            #time_dif = get_time_dif(start_time)
+            #print("step:%d, time:%s"%(total_batch, time_dif))
+            total_batch += 1
+            if ((total_batch - last_improved) > require_improvement):
+                # validation accuracy has not improved for a long time; stop training early
+                print('No optimization for a long time, auto-stopping...')
+                flag = True
+                break  # exit the batch loop
+        if flag:
+            break
+
+def test():
+    print('Loading test data...')
+
+    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)
+    # RewriterConfig is imported locally in train(), so import it here as well for a test-only run
+    from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
+    sess_config = tf.ConfigProto()
+    custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF  # remapping must be explicitly disabled
+    #custom_op.parameter_map["dynamic_input"].b = True
+    custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
+    #custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes("lazy_recompile")
+    session = tf.Session(config=sess_config)
+    session.run(tf.global_variables_initializer())
+    saver = tf.train.Saver()
+    saver.restore(sess=session, save_path=save_path)
+    start_time = time.time()
+    print('Testing...')
+    (loss_test, acc_test) = evaluate(session, x_test, y_test)
+    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
+    print(msg.format(loss_test, acc_test))
+    batch_size = 256
+    data_len = len(x_test)
+    num_batch = (int(((data_len - 1) / batch_size)) + 1)
+    y_test_cls = np.argmax(y_test, 1)
+    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)
+    for i in range(num_batch):
+        start_id = (i * batch_size)
+        end_id = min(((i + 1) * batch_size), data_len)
+        feed_dict = {model.input_x: x_test[start_id:end_id], model.keep_prob: 1.0}
+        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)
+    print('Precision, Recall and F1-Score...')
+    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))
+    print('Confusion Matrix...')
+    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
+    print(cm)
+    time_dif = get_time_dif(start_time)
+    print('Time usage:', time_dif)
+
+if (__name__ == '__main__'):
+    print('Configuring CNN model...')
+    config = TCNNConfig()
+    config.learning_rate = args.learning_rate
+    config.batch_size = args.batch_size
+    config.num_epochs = args.num_epochs
+    config.npu_loss_scale = args.npu_loss_scale
+    if (not os.path.exists(vocab_dir)):
+        build_vocab(train_dir, vocab_dir, config.vocab_size)
+    (categories, cat_to_id) = read_category()
+    (words, word_to_id) = read_vocab(vocab_dir)
+    config.vocab_size = len(words)
+    model = TextCNN(config)
+    if (args.mode == 'train'):
+        train()
+    elif (args.mode == 'test'):
+        test()
+    else:
+        train()
+        test()
\ No newline at end of file
diff --git a/TensorFlow/built-in/nlp/Textcnn_ID0123_For_Tensorflow/test/train_RT2_performance_1p.sh b/TensorFlow/built-in/nlp/Textcnn_ID0123_For_Tensorflow/test/train_RT2_performance_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a4e4401a59d591557ec88f7b1467067ab20dd110
--- /dev/null
+++ b/TensorFlow/built-in/nlp/Textcnn_ID0123_For_Tensorflow/test/train_RT2_performance_1p.sh
@@ -0,0 +1,180 @@
+#!/bin/bash
+
+#Current path; no modification needed
+cur_path=`pwd`
+
+#Collective communication parameters; no modification needed
+
+export RANK_SIZE=1
+export JOB_ID=10087
+export ENABLE_RUNTIME_V2=1
+RANK_ID_START=0
+
+
+# Dataset path; keep empty here, it is passed in via --data_path
+data_path=""
+
+#Basic parameters; review and modify per model
+#Network name, same as the directory name
+Network="Textcnn_ID0123_For_Tensorflow"
+#Training epochs
+train_epochs=10
+#Training batch size
+batch_size=512
+#Learning rate
+learning_rate=0.001
+
+#Training mode
+mode="train"
+npu_loss_scale=1
+
+#Debug parameters; precision_mode should be reviewed per model
+precision_mode="allow_fp32_to_fp16"
+#Fixed parameters; no modification needed below
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+# Help message; no modification needed
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_RT2_performance_1p.sh <args>"
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              whether to enable overflow detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              whether to enable profiling for performance debugging, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+#Parse command-line parameters; no modification needed
+for para in $*
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --learning_rate* ]];then
+        learning_rate=`echo ${para#*=}`
+    elif [[ $para == --mode* ]];then
+        mode=`echo ${para#*=}`
+    elif [[ $para == --npu_loss_scale* ]];then
+        npu_loss_scale=`echo ${para#*=}`
+    fi
+done
+
+#Verify that data_path was provided; no modification needed
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+#Training start time; no modification needed
+start_time=$(date +%s)
+
+#Enter the training script directory; review per model
+cd $cur_path/..
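+
+#RANK_SIZE=1 here, so the loop below launches exactly one training process;
+#the loop form is kept for symmetry with the multi-device test scripts.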
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    #Set environment variables; no modification needed
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+
+
+
+    #Create the DeviceID output directory; no modification needed
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    else
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    fi
+
+    #Run the training script; the arguments below need no modification, others should be reviewed per model
+    #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path
+    nohup python3 run_cnn_rt.py \
+        --mode=${mode} \
+        --data_path=${data_path} \
+        --num_epochs=${train_epochs} \
+        --save_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \
+        --learning_rate=${learning_rate} \
+        --precision_mode=${precision_mode} \
+        --over_dump=${over_dump} \
+        --over_dump_path=${over_dump_path} \
+        --data_dump_flag=${data_dump_flag} \
+        --data_dump_step=${data_dump_step} \
+        --data_dump_path=${data_dump_path} \
+        --batch_size=${batch_size} \
+        --profiling=${profiling} \
+        --npu_loss_scale=${npu_loss_scale} \
+        --profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+
+#Training end time; no modification needed
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#Print results; no modification needed
+echo "------------------ Final result ------------------"
+#Performance FPS, derived from the cumulative "( seconds )" timing values in the log; review per model
+time=(`grep -r "Time: " $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F '(' '{print $NF}' | cut -d ')' -f 1`)
+i=${#time[*]}
+train_time=`echo "${time[i-1]} ${time[1]} $i"|awk '{print ($1-$2)*10/($3-2)}'`
+FPS=`echo "$batch_size $train_time"|awk '{print $1*1000/$2}'`
+#Print; no modification needed
+echo "Final Performance images/sec : $FPS"
+
+#Output training accuracy; review and modify per model
+train_accuracy=""
+#Print; no modification needed
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#Performance monitoring result summary
+#Training case information; no modification needed
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p_RT2_perf'
+
+##Collect performance data; no modification needed
+#Throughput
+ActualFPS=${FPS}
+#Training time per iteration
+TrainingTime=$train_time
+
+#Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep "Train Loss:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $5}'|cut -d ',' -f 1 >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#Loss of the last iteration; no modification needed
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#Print key information into ${CaseName}.log; no modification needed
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+#echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/TensorFlow/contrib/cv/senet_ID0145_for_TensorFlow/run_npu_rt.py b/TensorFlow/contrib/cv/senet_ID0145_for_TensorFlow/run_npu_rt.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cfcc594807c7add49ac396c35e4b536a0602190
--- /dev/null
+++ b/TensorFlow/contrib/cv/senet_ID0145_for_TensorFlow/run_npu_rt.py
@@ -0,0 +1,213 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='MAIN')
+    parser.add_argument('--total_epochs', type=int, default=160, help='total_epochs')
+    parser.add_argument('--test_iteration', type=int, default=10, help='test_iteration')
+    parser.add_argument('--iteration', type=int, default=391, help='iteration')
+    parser.add_argument('--data_path', type=str, help='data_path')
+    args = parser.parse_args()
+    return args
+
+args = parse_args()
+
+weight_decay = 0.0001
+momentum = 0.9
+init_learning_rate = 0.01
+batch_size = 128
+iteration = args.iteration
+# 128 * 391 ~ 50,000
+total_epochs = args.total_epochs
+test_iteration = args.test_iteration
+data_dir = args.data_path
+
+
+
+if __name__ == '__main__':
+    def Evaluate(sess):
+        test_acc = 0.0
+        test_loss = 0.0
+        test_pre_index = 0
+        add = 1000
+
+        for it in range(test_iteration):
+            test_batch_x = test_x[test_pre_index: test_pre_index + add]
+            test_batch_y = test_y[test_pre_index: test_pre_index + add]
+            test_pre_index = test_pre_index + add
+
+            test_feed_dict = {
+                x: test_batch_x,
+                label: test_batch_y,
+                learning_rate: epoch_learning_rate,
+                training_flag: False
+            }
+
+            loss_, acc_ = sess.run([cost, accuracy], feed_dict=test_feed_dict)
+
+            test_loss += loss_
+            test_acc += acc_
+
+        test_loss /= test_iteration  # average loss
+        test_acc /= test_iteration  # average accuracy
+
+        summary = tf.Summary(value=[tf.Summary.Value(tag='test_loss', simple_value=test_loss),
+                                    tf.Summary.Value(tag='test_accuracy', simple_value=test_acc)])
+
+        return test_acc, test_loss, summary
+
+    import os
+    from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
+    from seresnetv2 import seresnet_v2
+    from cifar10 import *
+    import tensorflow as tf
+    from npu_bridge.npu_init import *
+    config = tf.ConfigProto()
+    custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
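+    # allow_mix_precision lets the Ascend graph compiler automatically run
+    # selected FP32 ops in FP16 while keeping precision-sensitive ops in FP32.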
+    #custom_op.parameter_map["dynamic_input"].b = True
+    #custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes("lazy_recompile")
+    custom_op.parameter_map["use_off_line"].b = True  # must be enabled explicitly to run training on the Ascend AI processor
+    config.graph_options.rewrite_options.remapping = RewriterConfig.OFF  # remapping must be explicitly disabled
+    config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
+
+    # Overflow detection (disabled by default)
+    #custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes("/home/HwHiAiUser/output")
+    #custom_op.parameter_map["enable_dump_debug"].b = True
+    #custom_op.parameter_map["dump_debug_mode"].s = tf.compat.as_bytes("all")
+
+    train_x, train_y, test_x, test_y = prepare_data()
+    train_x, test_x = color_preprocessing(train_x, test_x)
+
+    # image_size = 32, img_channels = 3, class_num = 10 in cifar10
+    x = tf.compat.v1.placeholder(tf.float32, shape=[None, image_size, image_size, img_channels])
+    label = tf.compat.v1.placeholder(tf.float32, shape=[None, class_num])
+    training_flag = tf.placeholder(tf.bool)
+    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
+
+    # Model
+    logits = seresnet_v2(x, 110)
+
+    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=label, logits=logits))
+    l2_loss = tf.add_n([tf.nn.l2_loss(var) for var in tf.trainable_variables()])
+
+    # Loss scale
+    loss_scale_manager = ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5)
+    opt_tmp = npu_tf_optimizer(
+        tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum, use_nesterov=True))
+    optimizer = NPULossScaleOptimizer(opt_tmp, loss_scale_manager)
+
+    #optimizer = npu_tf_optimizer(
+    #    tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum, use_nesterov=True))
+
+    train = optimizer.minimize(cost + l2_loss * weight_decay)
+    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(label, 1))
+    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+    saver = tf.train.Saver(tf.global_variables())
+
+
+    with tf.Session(config=config) as sess:
+        ckpt = tf.train.get_checkpoint_state('./model')
+        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
+            saver.restore(sess, ckpt.model_checkpoint_path)
+        else:
+            sess.run(tf.global_variables_initializer())
+
+        summary_writer = tf.summary.FileWriter('./logs', sess.graph)
+
+        epoch_learning_rate = init_learning_rate
+        for epoch in range(1, total_epochs + 1):
+            # Step schedule: ramp to 0.1 after the first epoch, then decay at epochs 80 and 120
+            if epoch == 2:
+                epoch_learning_rate = 0.1
+            if epoch == 80:
+                epoch_learning_rate = 0.01
+            if epoch == 120:
+                epoch_learning_rate = 0.001
+
+            pre_index = 0
+            train_acc = 0.0
+            train_loss = 0.0
+
+            for step in range(1, iteration + 1):
+                start = time.time()
+                if pre_index + batch_size < 50000:
+                    batch_x = train_x[pre_index: pre_index + batch_size]
+                    batch_y = train_y[pre_index: pre_index + batch_size]
+                else:
+                    batch_x = train_x[pre_index:]
+                    batch_y = train_y[pre_index:]
+
+                batch_x = data_augmentation(batch_x)
+
+                train_feed_dict = {
+                    x: batch_x,
+                    label: batch_y,
+                    learning_rate: epoch_learning_rate,
+                    training_flag: True
+                }
+
+                _, batch_loss = sess.run([train, cost], feed_dict=train_feed_dict)
+                batch_acc = accuracy.eval(feed_dict=train_feed_dict)
+                step_time = time.time() - start
+                train_loss += batch_loss
+                train_acc += batch_acc
+                pre_index += batch_size
+                # Each raw step time goes to stdout; the perf shell script greps these values
+                print(step_time)
+            train_loss /= iteration  # average loss
+            train_acc /= iteration  # average accuracy
+
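+            # Log the epoch-averaged loss/accuracy to TensorBoard via the summary writer below.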
+            train_summary = tf.Summary(value=[tf.Summary.Value(tag='train_loss', simple_value=train_loss),
+                                              tf.Summary.Value(tag='train_accuracy', simple_value=train_acc)])
+
+            #test_acc, test_loss, test_summary = Evaluate(sess)
+
+            summary_writer.add_summary(summary=train_summary, global_step=epoch)
+            #summary_writer.add_summary(summary=test_summary, global_step=epoch)
+            summary_writer.flush()
+
+            line = "epoch: %d/%d, train_loss: %.4f, train_acc: %.4f" % (
+                epoch, total_epochs, train_loss, train_acc)
+            print(line)
+
+            with open('logs.txt', 'a') as f:
+                f.write(line + '\n')
+
+            saver.save(sess=sess, save_path='model/senet110.ckpt')
+
+
+
diff --git a/TensorFlow/contrib/cv/senet_ID0145_for_TensorFlow/test/train_RT2_performance_1p.sh b/TensorFlow/contrib/cv/senet_ID0145_for_TensorFlow/test/train_RT2_performance_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d9b49b4d682f0d3a81bc9fe44a1d616d7fea5274
--- /dev/null
+++ b/TensorFlow/contrib/cv/senet_ID0145_for_TensorFlow/test/train_RT2_performance_1p.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+
+
+export RANK_SIZE=1
+export JOB_ID=10087
+export RANK_ID_START=0
+export ENABLE_RUNTIME_V2=1
+
+cur_path=`pwd`
+data_path=''
+ckpt_path=''
+Network='senet_ID0145_for_TensorFlow'
+batch_size=128
+total_epochs=1
+test_iteration=1
+iteration=10
+# train_performance_1p.sh perf
+# train_full_1p.sh acc
+CaseName="${Network}_bs${batch_size}_${RANK_SIZE}p_RT2_perf"
+
+
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_RT2_performance_1p.sh <args>"
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              whether to enable overflow detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              whether to enable profiling for performance debugging, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+        echo "${data_path}"
+    elif [[ $para == --ckpt_path* ]];then
+        ckpt_path=`echo ${para#*=}`
+        echo "${ckpt_path}"
+    elif [[ $para == --total_epochs* ]];then
+        total_epochs=`echo ${para#*=}`
+        echo "${total_epochs}"
+    elif [[ $para == --test_iteration* ]];then
+        test_iteration=`echo ${para#*=}`
+        echo "${test_iteration}"
+    fi
+done
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+cd $cur_path/../
+# START
+start_time=$(date +%s)
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt
+    else
+        mkdir -p ${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt
+    fi
+    nohup python3 run_npu_rt.py \
+        --data_path=${data_path}/cifar-10-batches-py \
+        --iteration=${iteration} \
+        --total_epochs=${total_epochs} \
+        --test_iteration=${test_iteration} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+    # wait
+    # nohup python3 eval.py \
+    # --data_path=${data_path}/cifar-10-batches-py >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+end_time=$(date +%s)
+e2e_time=$(( ${end_time} - ${start_time} ))
+
+
+echo "------------------ Final result ------------------"
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+# getFPS
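+#run_npu_rt.py prints one raw step time per training step; the last line starting
+#with "0." is taken as seconds/step (this assumes each step takes under one second).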
+sec_per_step=`grep '^0.' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | tail -n 1`
+FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${sec_per_step}'}'`
+ActualFPS=${FPS}
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'/'${FPS}'}'`
+# getAcc
+# train_accuracy=`grep 'test_acc' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F'test_acc: ' 'END{print $2}'`
+train_accuracy='None'
+# getLoss
+grep train_loss ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F'train_loss: ' '{print $2}' | awk -F',' '{print $1}' > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt
+ActualLoss=`awk 'END {print}' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt`
+echo "Final Performance images/sec : ${FPS}"
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : ${e2e_time}"
+
+
+echo "Network = ${Network}" > ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
diff --git a/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2374_for_TensorFlow2.X/test/train_RT2_performance_1p.sh b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2374_for_TensorFlow2.X/test/train_RT2_performance_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..73abdafd3b9db6f989b7afd29f682d93f6d25112
--- /dev/null
+++ b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2374_for_TensorFlow2.X/test/train_RT2_performance_1p.sh
@@ -0,0 +1,163 @@
+#!/bin/bash
+
+export ENABLE_RUNTIME_V2=1
+cur_path=`pwd`/../
+#Print failing-case logs to stdout
+#export ASCEND_SLOG_PRINT_TO_STDOUT=1
+# export HYBRID_PROFILING_LEVEL=1
+#Basic parameters; review and modify per model
+#Batch Size
+batch_size=128
+#Network name, same as the directory name
+Network="Swin-Transformer_ID2374_for_TensorFlow2.X"
+#Number of devices; 1 by default for single-card
+RankSize=1
+#Training epochs (optional)
+train_epochs=3
+#Training steps
+train_steps=
+#Learning rate
+learning_rate=0.01
+
+############Debug parameters##############
+precision_mode="allow_mix_precision"
+#Fixed parameters; no modification needed below
+over_dump=False
+if [[ $over_dump == True ]];then
+    over_dump_path=$cur_path/test/overflow_dump #cur_path here is the code root directory
+    mkdir -p ${over_dump_path}
+fi
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+use_mixlist=False
+mixlist_file="./configs/ops_info.json"
+fusion_off_flag=False
+fusion_off_file="./configs/fusion_switch.cfg"
+############Debug parameters##############
+
+#Parameter configuration
+data_path=""
+
+if [[ $1 == --help || $1 == --h ]];then
+    echo "usage:./train_RT2_performance_1p.sh <args>"
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              whether to enable overflow detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              whether to enable profiling for performance debugging, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
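+
+#Parse command-line overrides of the form --name=value.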
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --use_mixlist* ]];then
+        use_mixlist=`echo ${para#*=}`
+    elif [[ $para == --mixlist_file* ]];then
+        mixlist_file=`echo ${para#*=}`
+    elif [[ $para == --fusion_off_flag* ]];then
+        fusion_off_flag=`echo ${para#*=}`
+    elif [[ $para == --fusion_off_file* ]];then
+        fusion_off_file=`echo ${para#*=}`
+    fi
+done
+
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+##############Run training##############
+cd $cur_path
+if [ -d $cur_path/test/output ];then
+    rm -rf $cur_path/test/output/*
+    mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID
+fi
+wait
+
+#Copy the dataset to the default Keras datasets directory
+cp -r ${data_path}/cifar-100-python /root/.keras/datasets/
+
+start=$(date +%s)
+nohup python3 swin_transformers.py --epochs=${train_epochs} \
+    --precision_mode=${precision_mode} \
+    --over_dump=${over_dump} \
+    --over_dump_path=${over_dump_path} \
+    --data_dump_flag=${data_dump_flag} \
+    --data_dump_step=${data_dump_step} \
+    --data_dump_path=${data_dump_path} \
+    --profiling=${profiling} \
+    --use_mixlist=${use_mixlist} \
+    --fusion_off_flag=${fusion_off_flag} \
+    --mixlist_file=${mixlist_file} \
+    --fusion_off_file=${fusion_off_file} \
+    --profiling_dump_path=${profiling_dump_path} > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+wait
+end=$(date +%s)
+e2etime=$(( $end - $start ))
+
+#echo "Final Performance ms/step : $average_perf"
+echo "Final Training Duration sec : $e2etime"
+
+#Print results; no modification needed
+echo "------------------ Final result ------------------"
+#Time per step in ms, parsed from the Keras "352/352" progress lines; review per model
+TrainingTime=`grep "352/352" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F '352/352' '{print $2}'|grep -v 'ETA'|grep 'loss:'|awk 'END {print $4}'|cut -d 'm' -f -1`
+
+
+#Performance monitoring result summary
+#Training case information; no modification needed
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RankSize}'p_RT2_perf'
+
+##Collect performance data; no modification needed
+#Throughput
+ActualFPS=`awk 'BEGIN{printf "%.2f\n", '1000'*'${batch_size}'/'${TrainingTime}'}'`
+
+#Get model accuracy
+train_accuracy=`grep "352/352" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F '352/352' '{print $2}'|grep 'loss:'|awk 'END {print $10}'`
+
+#Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep "352/352" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F '352/352' '{print $2}'|grep -v 'ETA'|grep 'loss:'|awk '{print $7}' > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#Loss of the last iteration; no modification needed
+ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#Print key information into ${CaseName}.log; no modification needed
+echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RankSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2etime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2374_for_TensorFlow2.X/test/train_RT2_performance_8p.sh b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2374_for_TensorFlow2.X/test/train_RT2_performance_8p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..558a4a602b4eb2e4e5d8f5d9ac8158f77f6a97be
--- /dev/null
+++ b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2374_for_TensorFlow2.X/test/train_RT2_performance_8p.sh
@@ -0,0 +1,185 @@
+#!/bin/bash
+
+export ENABLE_RUNTIME_V2=1
+cur_path=`pwd`/..
+#Print failing-case logs to stdout
+
+#export DUMP_GRAPH_PATH=/home/dump_graph
+#export DUMP_GE_GRAPH=2
+#export DUMP_GRAPH_LEVEL=3
+
+export ASCEND_GLOBAL_LOG_LEVEL=3
+/usr/local/Ascend/driver/tools/msnpureport -g ERROR -d 0
+/usr/local/Ascend/driver/tools/msnpureport -g ERROR -d 1
+/usr/local/Ascend/driver/tools/msnpureport -g ERROR -d 2
+/usr/local/Ascend/driver/tools/msnpureport -g ERROR -d 3
+/usr/local/Ascend/driver/tools/msnpureport -g ERROR -d 4
+/usr/local/Ascend/driver/tools/msnpureport -g ERROR -d 5
+/usr/local/Ascend/driver/tools/msnpureport -g ERROR -d 6
+/usr/local/Ascend/driver/tools/msnpureport -g ERROR -d 7
+
+export RANK_SIZE=8
+export RANK_TABLE_FILE=${cur_path}/test/rank_table_8p.json
+export JOB_ID=10087
+RANK_ID_START=0
+#export ASCEND_DEVICE_ID=1
+#export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+#Basic parameters; review and modify per model
+#Batch Size
+batch_size=1024
+#Network name, same as the directory name
+Network="Swin-Transformer_ID2374_for_TensorFlow2.X"
+#Number of devices; 1 by default for single-card
+#RankSize=1
+#Training epochs (optional)
+train_epochs=5
+#Training steps
+train_steps=
+#Learning rate
+learning_rate=0.01
+
+#Parameter configuration
+data_path="1"
+
+############Debug parameters##############
+precision_mode="allow_mix_precision"
+#Fixed parameters; no modification needed below
+over_dump=False
+if [[ $over_dump == True ]];then
+    over_dump_path=$cur_path/test/overflow_dump #cur_path here is the code root directory
+    mkdir -p ${over_dump_path}
+fi
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+use_mixlist=False
+mixlist_file="./configs/ops_info.json"
+fusion_off_flag=False
+fusion_off_file="./configs/fusion_switch.cfg"
+############Debug parameters##############
+
+if [[ $1 == --help || $1 == --h ]];then
+    echo "usage:./train_RT2_performance_8p.sh <args>"
+    exit 1
+fi
+
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --use_mixlist* ]];then
+        use_mixlist=`echo ${para#*=}`
+    elif [[ $para == --mixlist_file* ]];then
+        mixlist_file=`echo ${para#*=}`
+    elif [[ $para == --fusion_off_flag* ]];then
+        fusion_off_flag=`echo ${para#*=}`
+    elif [[ $para == --fusion_off_file* ]];then
+        fusion_off_file=`echo ${para#*=}`
+    fi
+done
+
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+##############Run training##############
+cd $cur_path
+
+#Copy the dataset to the default Keras datasets directory
+cp -r ${data_path}/cifar-100-python /root/.keras/datasets/
+
+start=$(date +%s)
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    #Set environment variables; no modification needed
+    echo "Device ID: $RANK_ID"
+    export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    ASCEND_DEVICE_ID=$RANK_ID
+
+    if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate}
+    else
+        mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate}
+    fi
+
+#    export DUMP_GRAPH_PATH=test/output/${RANK_ID}/dump_graph_${RANK_ID}
+
+    nohup python3 swin_transformers.py --epochs=${train_epochs} --batch_size=${batch_size} \
+        --rank_size=${RANK_SIZE} \
+        --device_id=${RANK_ID} \
+        --precision_mode=${precision_mode} \
+        --over_dump=${over_dump} \
+        --over_dump_path=${over_dump_path} \
+        --data_dump_flag=${data_dump_flag} \
+        --data_dump_step=${data_dump_step} \
+        --data_dump_path=${data_dump_path} \
+        --profiling=${profiling} \
+        --use_mixlist=${use_mixlist} \
+        --fusion_off_flag=${fusion_off_flag} \
+        --mixlist_file=${mixlist_file} \
+        --fusion_off_file=${fusion_off_file} \
+        --profiling_dump_path=${profiling_dump_path} > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+done
+wait
+end=$(date +%s)
+e2etime=$(( $end - $start ))
+
+#echo "Final Performance ms/step : $average_perf"
+echo "Final Training Duration sec : $e2etime"
+
+#Print results; no modification needed
+echo "------------------ Final result ------------------"
+#Time per step in ms, parsed from the Keras "44/44" progress lines; review per model
+TrainingTime=`grep "44/44" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F '44/44' '{print $2}'|grep -v 'ETA'|grep 'loss:'|awk 'END {print $4}'|cut -d 'm' -f -1`
+
+###The fields below are used for smoke-test monitoring
+BatchSize=${batch_size}
+#Device type, obtained automatically
+DeviceType=`uname -m`
+#Case name, generated automatically
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p_RT2_perf'
+
+#Throughput
+ActualFPS=`awk 'BEGIN{printf "%.2f\n", '1000'*'${batch_size}'/'${TrainingTime}'}'`
+
+#Get model accuracy
+train_accuracy=`grep "44/44" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F '44/44' '{print $2}'|grep 'loss:'|awk 'END {print $10}'`
+
+#Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep "44/44" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F '44/44' '{print $2}'|grep -v 'ETA'|grep 'loss:'|awk '{print $7}' > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#Loss of the last iteration; no modification needed
+ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#Print key information into ${CaseName}.log; no modification needed
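+#The first field is written with '>' to create/truncate ${CaseName}.log; the rest append with '>>'.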
+echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2etime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log