From 57940bce1b82691cd357b0478d442a9a3735a591 Mon Sep 17 00:00:00 2001 From: huangju1993 Date: Mon, 31 Jul 2023 10:20:13 +0000 Subject: [PATCH 1/3] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Tens?= =?UTF-8?q?orFlow/built-in/cv/image=5Fclassification/ResNet50=5FID0058=5Ff?= =?UTF-8?q?or=5FTensorFlow/test/train=5Ffull=5F1p.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test/train_full_1p.sh | 212 ------------------ 1 file changed, 212 deletions(-) delete mode 100644 TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_1p.sh diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_1p.sh deleted file mode 100644 index 1f7776982..000000000 --- a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_1p.sh +++ /dev/null @@ -1,212 +0,0 @@ -#!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` - -#集合通信参数,不需要修改 -#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 -export RANK_SIZE=8 -export RANK_TABLE_FILE=${cur_path}/../configs/8p.json -export JOB_ID=10087 -RANK_ID_START=0 - -# 数据集路径,保持为空,不需要修改 -data_path="" - -#设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL=3 - -#基础参数 需要模型审视修改 -#网络名称,同目录名称 -Network="ResNet50_for_TensorFlow" - -export HCCL_CONNECT_TIMEOUT=600 -corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` -export RANK_INDEX=0 -export RANK_ID=0 - -config_file=res50_256bs_8p -max_train_steps=1000 -iterations_per_loop=100 -debug=True -eval=True -#维测参数,precision_mode需要模型审视修改 -precision_mode="must_keep_origin_dtype" - -#维持参数,不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False -autotune=False - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_full_8p.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is 0 - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --autotune whether to enable autotune, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --hf32 ]];then - hf32=`echo ${para#*=}` - elif [[ $para == --fp32 ]];then - fp32=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --autotune* ]];then - autotune=`echo ${para#*=}` - mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak - mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak - autotune_dump_path=${cur_path}/output/autotune_dump - mkdir -p ${autotune_dump_path}/GA - mkdir -p ${autotune_dump_path}/rl - cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ - cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - fi -done - -if [[ ${fp32} == "--hf32" ]];then - export ENABLE_HF32_EXECUTION=1 -fi - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi - -#autotune时,先开启autotune执行单P训练,不需要修改 -if [[ $autotune == True ]]; then - train_full_1p.sh --autotune=$autotune --data_path=$data_path - wait - autotune=False -fi - -#训练开始时间,不需要修改 -start_time=$(date +%s) - -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/.. -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); -do - #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - # export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - ASCEND_DEVICE_ID=$RANK_ID - export DEVICE_ID=$ASCEND_DEVICE_ID - DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 )) - export DEVICE_INDEX=$DEVICE_INDEX - - #创建DeviceID输出目录,不需要修改 - if [ -d $cur_path/output/$ASCEND_DEVICE_ID ];then - rm -rf $cur_path/output/$ASCEND_DEVICE_ID - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - fi - - #执行训练脚本,需要模型审视修改 - taskset -c $((ASCEND_DEVICE_ID*${corenum}/8))-$(((ASCEND_DEVICE_ID+1)*${corenum}/8-1)) python3.7 ${cur_path}/../src/mains/res50.py \ - --config_file=$config_file \ - --max_train_steps=$max_train_steps \ - --iterations_per_loop=$iterations_per_loop \ - --debug=$debug \ - --eval=$eval \ - --model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \ - --over_dump=${over_dump} \ - --over_dump_path=${over_dump_path} \ - --precision_mode ${precision_mode} \ - > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - #--precision_mode=${precision_mode} \ - #--data_dump_flag=${data_dump_flag} \ - #--data_dump_step=${data_dump_step} \ - #--data_dump_path=${data_dump_path} \ - #--profiling=${profiling} \ - #--profiling_dump_path=${profiling_dump_path} \ - #--autotune=${autotune} \ -done -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -echo "------------------ Final result ------------------" -#单step时长,需要从train_$ASCEND_DEVICE_ID.log里,通过关键字获取。需要模型审视修改 -#step_sec=`grep TimeHistory $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $6}'` -#echo "Final Performance ms/step : $step_sec" -#计算训练时长,需要模型审视修改 -#step_sec=`echo ${step_sec%.*}` -#e2e_sec=`expr ${train_epochs} \* 1281167 / ${step_sec} ` -#echo "Final Training Duration sec : $e2e_sec" -#训练精度,需要从train_$ASCEND_DEVICE_ID.log里,通过关键字获取。需要模型审视修改 -li=`cat $cur_path/output/0/train_0.log | wc -l` -num=$(($li - 1)) -train_accuracy=`sed -n '${num}p' $cur_path/output/0/train_0.log | awk '{print $3}'` -echo "Final train_accuracy is ${train_accuracy}" -E2E训练端到端时长,直接计算,不需要修改 -echo "E2E training Duration sec: $e2e_time" - -#训练用例信息,不需要修改 -BatchSize=256 -DeviceType=`uname -m` -if [[ ${fp32} == "--fp32" ]];then - CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'fp32'_'acc' -elif [[ ${hf32} == "--hf32" ]];then - CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'hf32'_'acc' -else - CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' -fi - - -##获取性能数据,不需要修改 -#吞吐量 -ActualFPS="NULL" -#单迭代训练时长 -TrainingTime="NULL" - -##获取Loss,通过train_*.log中关键字,需要根据模型审视 -#grep train_loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v BatchTimestamp|awk '{print $10}'|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值,不需要修改 -ActualLoss=`sed -n '${num}p' $cur_path/output/0/train_0.log | awk '{print $5}'` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RANK_SIZE = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime= ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From d9682874f586a2cc209c4c066c946b7822c4f8a8 Mon Sep 17 00:00:00 2001 From: huangju1993 Date: Mon, 31 Jul 2023 10:22:08 +0000 Subject: [PATCH 2/3] =?UTF-8?q?1p=E7=B2=BE=E5=BA=A6=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: huangju1993 --- .../test/train_full_1p.sh | 194 ++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_1p.sh diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_1p.sh new file mode 100644 index 000000000..8b8889baf --- /dev/null +++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_1p.sh @@ -0,0 +1,194 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +export JOB_ID=99990001 +export RANK_ID=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#设置默认日志级别,不需要修改 +export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP_ETP=3 + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="ResNet50_ID0058_for_TensorFlow" +#训练epoch +#train_epochs=1 +#训练batch_size +batch_size=256 +#训练step +#train_steps=56300 +#学习率 +learning_rate= + +#维测参数,precision_mode需要模型审视修改 +precision_mode="must_keep_origin_dtype" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False +autotune=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --hf32 ]];then + hf32=`echo ${para#*=}` + elif [[ $para == --fp32 ]];then + fp32=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +if [[ ${fp32} == "--hf32" ]];then + export ENABLE_HF32_EXECUTION=1 +fi + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +#修改参数 +sed -i "50s|PATH_TO_BE_CONFIGURED|${data_path}|g" $cur_path/../src/configs/res50_256bs_1p_eval.py +sed -i "107s|PATH_TO_BE_CONFIGURED|${cur_path}/output/0/d\_solution/ckpt0|g" $cur_path/../src/configs/res50_256bs_1p_eval.py + +cp data_loader.py $cur_path/../src/data_loader/resnet50/ +#训练开始时间,不需要修改 +start_time=$(date +%s) +cd $cur_path/../ +#进入训练脚本目录,需要模型审视修改 +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export DEVICE_INDEX=$RANK_ID + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune + nohup python3.7 ${cur_path}/../src/mains/res50.py --config_file=res50_256bs_1p_eval \ + --iterations_per_loop=100 \ + --debug=True \ + --eval=True \ + --precision_mode ${precision_mode} \ + --model_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/d_solution/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#参数改回 +sed -i "50s|${data_path}|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_256bs_1p_eval.py +sed -i "107s|${cur_path}/output/0/d\_solution/ckpt0|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_256bs_1p_eval.py + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`cat ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep "FPS: " | awk -F "FPS: " '{print $2}' | awk -F " loss:" '{print $1}' | tail -n +2 | awk '{sum+=$1} END {print sum/NR}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +#train_accuracy=`grep -A 1 top1 $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $3}'` +train_accuracy=`grep -A 2 "top1" ${cur_path}/output/0/train_0.log|tail -1|awk 'END {print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +if [[ ${fp32} == "--fp32" ]];then + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'fp32'_'acc' +elif [[ ${hf32} == "--hf32" ]];then + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'hf32'_'acc' +else + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' +fi +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "FPS: " $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss: " '{print $2}' | awk -F "total" '{print $1}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From ca4cdcf332d34286e51238ae30bf5eb9bd5f5890 Mon Sep 17 00:00:00 2001 From: huangju1993 Date: Tue, 1 Aug 2023 02:32:21 +0000 Subject: [PATCH 3/3] update TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/README.md. Signed-off-by: huangju1993 --- .../ResNet50_ID0058_for_TensorFlow/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/README.md b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/README.md index 20421f555..6a5052831 100644 --- a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/README.md +++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/README.md @@ -161,6 +161,11 @@ bash train_performance_bs32_8p.sh ``` 当前需开启混合精度模式训练方式: +1P training is also similar to the former: +``` +cd /path/to/ResNet50_ID0058_for_TensorFlow/test +train_performance_bs256_hw192_1p.sh --bind_core=1 --precision_mode="allow_mix_precision" +``` 8P training is also similar to the former: ``` cd /path/to/ResNet50_ID0058_for_TensorFlow/test -- Gitee