diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/run_squad.py b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/run_squad.py index c97161572146ed03bf63e757c09cd314fa9fcc1e..34e99463223734a1cc870fbc313e071c0a1698f8 100644 --- a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/run_squad.py +++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/run_squad.py @@ -44,6 +44,8 @@ from utils import is_main_process, format_step import dllogger, time from apex.optimizers import npu_fused_bert_adam, NpuFusedBertAdam +RANK = int(os.getenv('RANK')) + # torch._C._jit_set_profiling_mode(False) # torch._C._jit_set_profiling_executor(False) @@ -507,7 +509,7 @@ def get_answers(examples, features, results, args): # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. - if not nbest: + if not nbest: nbest.append(Prediction(text="empty", start_logit=0.0, end_logit=0.0)) total_scores = [] @@ -555,7 +557,7 @@ def get_answer_text(example, feature, pred, args): return final_text def get_valid_prelim_predictions(start_indices, end_indices, feature, result, args): - + _PrelimPrediction = collections.namedtuple( "PrelimPrediction", ["start_index", "end_index", "start_logit", "end_logit"]) @@ -733,7 +735,7 @@ def _compute_softmax(scores): # from apex.multi_tensor_apply import multi_tensor_applier # class GradientClipper: # """ -# Clips gradient norm of an iterable of parameters. +# Clips gradient norm of an iterable of parameters. # """ # def __init__(self, max_grad_norm): # self.max_norm = max_grad_norm @@ -897,7 +899,7 @@ def main(): help="addr used for distributed training") args = parser.parse_args() - args.fp16 = args.fp16 or args.amp + args.fp16 = args.fp16 or args.amp if args.local_rank == -1 or args.no_cuda: if args.use_npu: @@ -913,7 +915,8 @@ def main(): os.environ['MASTER_PORT'] = '29668' torch.npu.set_device("npu:%d" % args.local_rank) device = torch.device("npu:%d" % args.local_rank) - torch.distributed.init_process_group(backend='hccl', world_size=8, rank=args.local_rank) + print("the RANK is :", RANK) + torch.distributed.init_process_group(backend='hccl', world_size=args.num_npu, rank=RANK) n_npu = 1 else: torch.cuda.set_device(args.local_rank) @@ -928,7 +931,7 @@ def main(): dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)]) else: dllogger.init(backends=[]) - + # print("device: {} n_npu: {}, distributed training: {}, 16-bits training: {}".format( # device, n_npu, bool(args.local_rank != -1), args.fp16)) print("train on device {}, rank {}".format(device, args.local_rank)) @@ -1148,7 +1151,7 @@ def main(): else: loss.backward() - + if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 : # modify learning rate with special warm up for BERT which FusedAdam doesn't do @@ -1280,4 +1283,4 @@ def main(): if __name__ == "__main__": main() - dllogger.flush() + dllogger.flush() \ No newline at end of file diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_8p.sh b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_8p.sh index 97d53c33fc3d6626ad1538182c3a6571451a9650..3b555b4232b1b1a109755b3a6e20836f4907c88d 100644 --- a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_8p.sh @@ -96,6 +96,7 @@ then do let p_start=0+24*i let p_end=23+24*i + export RANK=${i} if [ -d ${cur_path}/output/${i} ];then rm -rf ${cur_path}/output/${i} mkdir -p ${cur_path}/output/$i @@ -107,6 +108,7 @@ then else for i in $(seq 0 7) do + export RANK=${i} if [ -d ${cur_path}/output/${i} ];then rm -rf ${cur_path}/output/${i} mkdir -p ${cur_path}/output/$i diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_16p.sh b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_16p.sh new file mode 100644 index 0000000000000000000000000000000000000000..751f6a47637d43d858208d9daeb44f1eea0976ef --- /dev/null +++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_16p.sh @@ -0,0 +1,220 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` +#source ../env_npu.sh + +data_path="" +conf_path="" +server_index="" +fix_node_ip="" +#集合通信参数,不需要修改 + +export RANK_SIZE=16 + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertBase_ID0490_for_PyTorch" +#训练batch_size +batch_size=80 + + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --conf_path* ]];then + conf_path=`echo ${para#*=}` + elif [[ $para == --server_index* ]];then + server_index=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +linux_num=`find $conf_path -name "server_*.info" |wc -l` + +export HCCL_IF_IP=$fix_node_ip +export MASTER_ADDR=$one_node_ip + +rank_server=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'` +export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'` + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +cur_1=${1:-"1"} +cur_2=${2:-"2"} +cur_3=${3:-"3"} +cur_4=${4:-"4"} +init_checkpoint=${5:-"`${data_path}/pretrained/bert_base_pretrain.pt`"} +epochs=${6:-"1.0"} +batch_size=${7:-"80"} +learning_rate=${8:-"2e-4"} +precision=${9:-"fp16"} +num_npu=${10:-"16"} +seed=${11:-"1"} +squad_dir=${12:-"`${data_path}/squad/v1.1`"} +vocab_file=${13:-"data/uncased_L-24_H-1024_A-16/vocab.txt"} +OUT_DIR=${14:-"results/SQuAD"} +mode=${15:-"train eval"} +CONFIG_FILE=${16:-"bert_base_config.json"} +max_steps=${17:-"-1"} + +echo "out dir is $OUT_DIR" +mkdir -p $OUT_DIR +if [ ! -d "$OUT_DIR" ]; then + echo "ERROR: non existing $OUT_DIR" + exit 1 +fi + +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16=" --fp16 " +fi + +CMD="python3.7 run_squad.py " +CMD+="--init_checkpoint=${data_path}/pretrained/bert_base_pretrain.pt " +if [ "$mode" = "train" ] ; then + CMD+="--do_train " + CMD+="--train_file=${data_path}/squad/v1.1/train-v1.1.json " + CMD+="--train_batch_size=$batch_size " +elif [ "$mode" = "eval" ] ; then + CMD+="--do_predict " + CMD+="--predict_file=${data_path}/squad/v1.1/dev-v1.1.json " + CMD+="--predict_batch_size=$batch_size " + CMD+="--eval_script=${data_path}/squad/v1.1/evaluate-v1.1.py " + CMD+="--do_eval " +elif [ "$mode" = "prediction" ] ; then + CMD+="--do_predict " + CMD+="--predict_file=${data_path}/squad/v1.1/dev-v1.1.json " + CMD+="--predict_batch_size=$batch_size " +else + CMD+=" --do_train " + CMD+=" --train_file=${data_path}/squad/v1.1/train-v1.1.json " + CMD+=" --train_batch_size=$batch_size " + CMD+="--do_predict " + CMD+="--predict_file=${data_path}/squad/v1.1/dev-v1.1.json " + CMD+="--predict_batch_size=$batch_size " + CMD+="--eval_script=${data_path}/squad/v1.1/evaluate-v1.1.py " + CMD+="--do_eval " +fi + +CMD+=" --do_lower_case " +CMD+=" --bert_model=bert-large-uncased " +CMD+=" --learning_rate=$learning_rate " +CMD+=" --seed=$seed " +CMD+=" --num_train_epochs=$epochs " +CMD+=" --max_seq_length=384 " +CMD+=" --doc_stride=128 " +CMD+=" --output_dir=$OUT_DIR " +CMD+=" --vocab_file=$vocab_file " +CMD+=" --config_file=$CONFIG_FILE " +CMD+=" --max_steps=$max_steps " +CMD+=" $use_fp16" +CMD+=" --use_npu" +CMD+=" --num_npu=$num_npu" +CMD+=" --loss_scale=4096" +CMD+=" --addr=$one_node_ip" + +if [ $(uname -m) = "aarch64" ] +then + for i in $(seq 0 7) + do + let p_start=0+24*i + let p_end=23+24*i + export RANK=`awk 'BEGIN{printf "%.0f\n",'${i}'+'${rank_server}'}'` + if [ -d ${cur_path}/output/${i} ];then + rm -rf ${cur_path}/output/${i} + mkdir -p ${cur_path}/output/$i + else + mkdir -p ${cur_path}/output/$i + fi + taskset -c $p_start-$p_end $CMD --local_rank=$i > ${cur_path}/output/${i}/train_${i}.log 2>&1 & + done +else + for i in $(seq 0 7) + do + if [ -d ${cur_path}/output/${i} ];then + rm -rf ${cur_path}/output/${i} + mkdir -p ${cur_path}/output/$i + else + mkdir -p ${cur_path}/output/$i + fi + $CMD --local_rank=$i > ${cur_path}/output/${i}/train_${i}.log 2>&1 & + done +fi +wait + +ASCEND_DEVICE_ID=0 +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +iter=`grep 'Epoch: ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "iter/s :" '{print $NF}'|awk 'NR==1{max=$1;next}{max=max>$1?max:$1}END{print max}'` +FPS=`awk 'BEGIN{printf "%.2f\n",'${iter}'*16*'${batch_size}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep -r "step_loss :" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +rm -rf ${data_path}/squad/v1.1/train-v1.1.json_bert-large-uncased_384_128_64 \ No newline at end of file diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_8p.sh b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_8p.sh index 1e7536648a7f2a03bd1877a3c095cb3617ec51db..d38de0ad260014dfb92fe9b1b48de0894f0a5965 100644 --- a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_8p.sh @@ -130,6 +130,7 @@ then do let p_start=0+24*i let p_end=23+24*i + export RANK=${i} if [ -d ${cur_path}/output/${i} ];then rm -rf ${cur_path}/output/${i} mkdir -p ${cur_path}/output/$i @@ -141,6 +142,7 @@ then else for i in $(seq 0 7) do + export RANK=${i} if [ -d ${cur_path}/output/${i} ];then rm -rf ${cur_path}/output/${i} mkdir -p ${cur_path}/output/$i