diff --git a/PyTorch/dev/cv/image_classification/2D_Unet_ID0624_for_PyTorch/test/train_full_16p.sh b/PyTorch/dev/cv/image_classification/2D_Unet_ID0624_for_PyTorch/test/train_full_16p.sh new file mode 100644 index 0000000000000000000000000000000000000000..b3cefa070c580e22e843f2252c2ea40aab7bde05 --- /dev/null +++ b/PyTorch/dev/cv/image_classification/2D_Unet_ID0624_for_PyTorch/test/train_full_16p.sh @@ -0,0 +1,142 @@ +#!/bin/bash + +cur_path=`pwd`/../ +#失败用例打屏 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 + +#基础参数,需要模型审视修改 +#Batch Size +batch_size=1 +#网络名称,同目录名称 +Network="2D_Unet_ID0624_for_PyTorch" +#Device数量,单卡默认为1 +RANK_SIZE=16 +#训练epoch,可选 +train_epochs=5 +#训练step +train_steps= +#学习率 +learning_rate=1e-3 +#参数配置 +data_path="" +conf_path="" +server_index="" +fix_node_ip="" + +if [[ $1 == --help || $1 == --h ]];then + echo "usage:./train_accormance_1p.sh " + exit 1 +fi + +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --conf_path* ]];then + conf_path=`echo ${para#*=}` + elif [[ $para == --server_index* ]];then + server_index=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + fi +done + +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi + +one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +linux_num=`find $conf_path -name "server_*.info" |wc -l` + +export HCCL_IF_IP=$fix_node_ip +export MASTER_ADDR=$one_node_ip + +##############执行训练########## +cd $cur_path +if [ -d $cur_path/test/output ];then + rm -rf $cur_path/test/output/* + mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID +else + mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID +fi +wait + +export HCCL_WHITELIST_DISABLE=1 +#export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=23456 + + +export NPU_CALCULATE_DEVICE=$ASCEND_DEVICE_ID +sed -i "s|data/imgs/|$data_path/imgs/|g" $cur_path/train.py +sed -i "s|data/masks/|$data_path/masks/|g" $cur_path/train.py +start=$(date +%s) +#nohup python3 train.py -e $train_epochs > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & + + +NPUS=($(seq 0 7)) +rank_server=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'` +export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'` +#export NPU_WORLD_SIZE=${#NPUS[@]} +rank=0 +for i in ${NPUS[@]} +do + mkdir -p $cur_path/test/output/${i}/ + export NPU_CALCULATE_DEVICE=${i} + export ASCEND_DEVICE_ID=${i} + export RANK=`awk 'BEGIN{printf "%.0f\n",'${rank}'+'${rank_server}'}'` + echo run process ${rank} + nohup python3 train.py -e $train_epochs -l 0.0001 --distributed True > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${i}.log 2>&1 & + let rank++ +done +wait +end=$(date +%s) +e2e_time=$(( $end - $start )) + + + +#输出训练精度,需要模型审视修改 +#打印,不需要修改 +#echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +sed -i "s|\r|\n|g" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log +TrainingTime=0 +FPS=`grep img/s $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log |grep -v 0% | awk -F "," '{print$2}' | awk '{print$1}' | awk -F "i" '{print$1}' | awk '{sum+=$1} END {print"",sum/NR}'|sed s/[[:space:]]//g` +FPS=$(awk 'BEGIN{print '$FPS'*16}') +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*1000/'${FPS}'}'` + +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +ActualFPS=${FPS} + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Epoch $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| awk -F "," '{print$3}' | awk -F "=" '{print$2}' | awk -F "]" '{print$1}'| awk '{if(length !=0)print $0}' > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + + +#精度值 +train_accuracy=`grep "Validation Dice" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $NF}'|awk 'NR==1{max=$1;next}{max=max>$1?max:$1}END{print max}'` + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/dev/cv/image_classification/2D_Unet_ID0624_for_PyTorch/test/train_performance_16p.sh b/PyTorch/dev/cv/image_classification/2D_Unet_ID0624_for_PyTorch/test/train_performance_16p.sh new file mode 100644 index 0000000000000000000000000000000000000000..cff304ed2b1e884cd43873500eb8a05e552d8112 --- /dev/null +++ b/PyTorch/dev/cv/image_classification/2D_Unet_ID0624_for_PyTorch/test/train_performance_16p.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +cur_path=`pwd`/../ +#失败用例打屏 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 + +source /usr/local/Ascend/bin/setenv.bash + +export PATH=/usr/local/hdf5/bin:$PATH +export LD_LIBRARY_PATH=/usr/local/hdf5/lib:$LD_LIBRARY_PATH +export LIBRARY_PATH=/usr/local/hdf5/lib:$LIBRARY_PATH +export CPATH=/usr/local/hdf5/include:$CPATH + +#基础参数,需要模型审视修改 +#Batch Size +batch_size=1 +#网络名称,同目录名称 +Network="2D_Unet_ID0624_for_PyTorch" +#Device数量,单卡默认为1 +RANK_SIZE=16 +#训练epoch,可选 +train_epochs=1 +#训练step +train_steps= +#学习率 +learning_rate=1e-3 +#参数配置 +data_path="" +conf_path="" +server_index="" +fix_node_ip="" + +if [[ $1 == --help || $1 == --h ]];then + echo "usage:./train_performance_1p.sh " + exit 1 +fi + +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --conf_path* ]];then + conf_path=`echo ${para#*=}` + elif [[ $para == --server_index* ]];then + server_index=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + fi +done + +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi + +one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +linux_num=`find $conf_path -name "server_*.info" |wc -l` + +export HCCL_IF_IP=$fix_node_ip +export MASTER_ADDR=$one_node_ip + +##############执行训练########## +cd $cur_path +if [ -d $cur_path/test/output ];then + rm -rf $cur_path/test/output/* + mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID +else + mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID +fi +wait + + +export NPU_CALCULATE_DEVICE=$ASCEND_DEVICE_ID +sed -i "s|data/imgs/|$data_path/imgs/|g" $cur_path/train.py +sed -i "s|data/masks/|$data_path/masks/|g" $cur_path/train.py +#sed -i "s|if global_step == 100: pass|if global_step == 100: break|g" $cur_path/train.py +start=$(date +%s) +#nohup python3 train.py -e $train_epochs > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & + + +export HCCL_WHITELIST_DISABLE=1 +export MASTER_PORT=23456 +NPUS=($(seq 0 7)) +rank_server=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'` +export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'` +rank=0 +for i in ${NPUS[@]} +do + mkdir -p $cur_path/test/output/${i}/ + export NPU_CALCULATE_DEVICE=${i} + export ASCEND_DEVICE_ID=${i} + export RANK=`awk 'BEGIN{printf "%.0f\n",'${rank}'+'${rank_server}'}'` + echo run process ${rank} + nohup python3 train.py -e $train_epochs --distributed True > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${i}.log 2>&1 & + let rank++ +done +wait +end=$(date +%s) +e2e_time=$(( $end - $start )) + + +#sed -i "s|if global_step == 100: break|if global_step == 100: pass|g" $cur_path/train.py + +#输出训练精度,需要模型审视修改 +#打印,不需要修改 +#echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +sed -i "s|\r|\n|g" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log +TrainingTime=0 +FPS=`grep img/s $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | grep -v 0% | awk -F "," '{print$2}' | awk '{print$1}' | awk -F "i" '{print$1}' | awk '{sum+=$1} END {print"",sum/NR}'|sed s/[[:space:]]//g` +FPS=$(awk 'BEGIN{print '$FPS'*16}') + +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*1000/'${FPS}'}'` + +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +ActualFPS=${FPS} + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Epoch $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| awk -F "," '{print$3}' | awk -F "=" '{print$2}' | awk -F "]" '{print$1}'| awk '{if(length !=0)print $0}' > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + + +#精度值 +#train_accuracy=`grep "loss" $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss_2.txt|awk -F " " '{print $8}'|awk 'END {print}'` + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +#echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_16p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_16p.sh new file mode 100644 index 0000000000000000000000000000000000000000..f1dd25a169a86446ba167dbea7d960409b89a8a0 --- /dev/null +++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_16p.sh @@ -0,0 +1,175 @@ +#!/bin/bash + +cur_path=`pwd` +#集合通信参数,不需要修改 +export RANK_SIZE=16 +#export MASTER_ADDR=localhost +export MASTER_PORT=29688 +# 数据集路径,保持为空,不需要修改 +data_path="" +conf_path="" +server_index="" +fix_node_ip="" + +#网络名称,同目录名称,需要模型审视修改 +Network="Transformer_ID0105_for_PyTorch" + +export BMMV2_ENABLE=1 +#训练epoch +train_epochs=30 +#训练batch_size,,需要模型审视修改 +batch_size=128 + + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --epochs* ]];then + epochs=`echo ${para#*=}` + elif [[ $para == --conf_path* ]];then + conf_path=`echo ${para#*=}` + elif [[ $para == --server_index* ]];then + server_index=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +linux_num=`find $conf_path -name "server_*.info" |wc -l` + +export HCCL_IF_IP=$fix_node_ip +export MASTER_ADDR=$one_node_ip + + +#创建DeviceID输出目录,不需要修改 +if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then + rm -rf $cur_path/output/$ASCEND_DEVICE_ID + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID +else + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID +fi + +#################启动训练脚本################# + + +# 必要参数替换配置文件 +cd $cur_path/.. +DATA_DIR=./data/dataset/wmt14_en_de_joined_dict/ +MODELDIR="./checkpoints/" +mkdir -p "$MODELDIR" +LOGFILE="$MODELDIR/log" +STAT_FILE="log.txt" + + +start_time=$(date +%s) +NPUS=($(seq 0 7)) +rank_server=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'` +export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'` +#export NPU_WORLD_SIZE=${#NPUS[@]} +rank=0 +for i in ${NPUS[@]} +do + export NPU_CALCULATE_DEVICE=${i} + mkdir -p $cur_path/output/${i}/ + export ASCEND_DEVICE_ID=${i} + export RANK=`awk 'BEGIN{printf "%.0f\n",'${rank}'+'${rank_server}'}'` + echo run process ${rank} + + + python3 train_8p_new.py \ + $data_path \ + --arch transformer_wmt_en_de \ + --share-all-embeddings \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.997 \ + --addr ${one_node_ip} \ + --port 29990 \ + --adam-eps "1e-9" \ + --clip-norm 0.0 \ + --lr-scheduler inverse_sqrt \ + --warmup-init-lr 0.0 \ + --warmup-updates 4000 \ + --lr 0.0006 \ + --min-lr 0.0 \ + --dropout 0.1 \ + --weight-decay 0.0 \ + --criterion label_smoothed_cross_entropy \ + --label-smoothing 0.1 \ + --max-sentences 128\ + --max-tokens 102400 \ + --max-epoch $train_epochs \ + --seed 1 \ + --save-dir $MODELDIR \ + --stat-file $STAT_FILE\ + --log-interval 1\ + --amp\ + --device-id ${rank}\ + --amp-level O2 > $cur_path/output/${i}/train_${i}.log 2>&1 & + let rank++ +done +wait + + + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -rns "Time" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |grep -v "all" |awk -F "Time" '{print$2}' |awk -F "(" '{print$1}'|tail -n +5|awk '{sum+=$1} END {print"",16*128*NR/sum}'|sed s/[[:space:]]//g` + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -rns "Validation" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk 'END {print $6}'` + +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +#获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'$FPS'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 +grep -rns "Time" ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |grep -v "all" |awk -F "Loss" '{print$2}' |awk -F "(" '{print$1}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${cur_path}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_16p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_16p.sh new file mode 100644 index 0000000000000000000000000000000000000000..09788d343cad407c7cce3649aba9a22186d01edd --- /dev/null +++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_16p.sh @@ -0,0 +1,176 @@ +#!/bin/bash + +cur_path=`pwd` +#nmon -s3 -c 500 -f -m $cur_path +#集合通信参数,不需要修改 +export RANK_SIZE=16 +#export MASTER_ADDR=localhost +export MASTER_PORT=29688 +export HCCL_WHITELIST_DISABLE=1 +export BMMV2_ENABLE=1 +# 数据集路径,保持为空,不需要修改 +data_path="" +conf_path="" +server_index="" +fix_node_ip="" + +#网络名称,同目录名称,需要模型审视修改 +Network="Transformer_ID0105_for_PyTorch" + +#训练batch_size,,需要模型审视修改 +batch_size=128 + + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --epochs* ]];then + epochs=`echo ${para#*=}` + elif [[ $para == --conf_path* ]];then + conf_path=`echo ${para#*=}` + elif [[ $para == --server_index* ]];then + server_index=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +linux_num=`find $conf_path -name "server_*.info" |wc -l` + +export HCCL_IF_IP=$fix_node_ip +export MASTER_ADDR=$one_node_ip + + +#创建DeviceID输出目录,不需要修改 +if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then + rm -rf $cur_path/output/$ASCEND_DEVICE_ID + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID +else + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID +fi + +#################启动训练脚本################# + + +# 必要参数替换配置文件 +cd $cur_path/.. +DATA_DIR=./data/dataset/wmt14_en_de_joined_dict/ +MODELDIR="./checkpoints/" +mkdir -p "$MODELDIR" +LOGFILE="$MODELDIR/log" +STAT_FILE="log.txt" + +sed -i "s|if i>100:pass|if i>100:break|g" train_8p_new.py +sed -i "s|if m >=2:pass|if m >=2:break|g" train_8p_new.py + +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=3 +export PTCOPY_ENABLE=1 +export TASK_QUEUE_ENABLE=1 +export DYNAMIC_OP="ADD#MUL" +start_time=$(date +%s) +NPUS=($(seq 0 7)) +rank_server=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'` +export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'` +#export NPU_WORLD_SIZE=${#NPUS[@]} +rank=0 +for i in ${NPUS[@]} +do + export NPU_CALCULATE_DEVICE=${i} + mkdir -p $cur_path/output/${i}/ + export ASCEND_DEVICE_ID=${i} + export RANK=`awk 'BEGIN{printf "%.0f\n",'${rank}'+'${rank_server}'}'` + echo run process ${rank} + + + python3 train_8p_new.py \ + $data_path \ + --arch transformer_wmt_en_de \ + --share-all-embeddings \ + --optimizer adam \ + --adam-beta1 0.9 \ + --distributed-world-size ${NPU_WORLD_SIZE} \ + --adam-beta2 0.997 \ + --addr ${one_node_ip} \ + --port 29990 \ + --adam-eps "1e-9" \ + --clip-norm 0.0 \ + --lr-scheduler inverse_sqrt \ + --warmup-init-lr 0.0 \ + --warmup-updates 4000 \ + --lr 0.0006 \ + --min-lr 0.0 \ + --dropout 0.1 \ + --weight-decay 0.0 \ + --criterion label_smoothed_cross_entropy \ + --label-smoothing 0.1 \ + --max-sentences 128\ + --max-tokens 102400 \ + --seed 1 \ + --save-dir $MODELDIR \ + --stat-file $STAT_FILE\ + --log-interval 1\ + --amp\ + --device-id ${rank}\ + --amp-level O2 > $cur_path/output/${i}/train_${i}.log 2>&1 & + let rank++ +done +wait +sed -i "s|if i>100:break|if i>100:pass|g" train_8p_new.py +sed -i "s|if m >=2:break|if m >=2:pass|g" train_8p_new.py + + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=` grep -rns "Time" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |grep -v "all" |awk -F "Time" '{print$2}' |awk -F "(" '{print$1}'|tail -n +5|awk '{sum+=$1} END {print"",16*128*NR/sum}'|sed s/[[:space:]]//g` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#打印,不需要修改 +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +#获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 +grep -rns "Time" ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |grep -v "all" |awk -F "Loss" '{print$2}' |awk -F "(" '{print$1}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${cur_path}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_8p_new.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_8p_new.py index 0179a825cfad84454d00ed41788b4f50e8bb39de..6cfe2702485f2de56e8625f23fbb32876ce906e0 100644 --- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_8p_new.py +++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_8p_new.py @@ -111,8 +111,9 @@ def main(): print(args) os.environ['MASTER_ADDR'] = args.addr os.environ['MASTER_PORT'] = args.port + device_id = args.device_id #mp.spawn(main_worker, nprocs=args.distributed_world_size, args=(args.distributed_world_size, args)) - main_worker(pid_idx=RANK, args=args) + main_worker(pid_idx=device_id, args=args) @@ -240,7 +241,7 @@ def train(args, trainer, datasets, epoch_itr): # reset meters DLLogger.flush() trainer.get_throughput_meter().reset() - + for i, sample in enumerate(itr): if i>100:pass if i < num_batches - 1 and (i + 1) % update_freq > 0: @@ -414,4 +415,4 @@ def load_checkpoint(args, trainer, epoch_itr): if __name__ == '__main__': - main() + main() \ No newline at end of file