diff --git a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md index 9ff988e59c3e48d2b523da88dbae7d6ba8acaf0b..c31b0b5991fcce3902785743f6a1391472e942ab 100644 --- a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md +++ b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md @@ -142,6 +142,8 @@ BERT是一种与训练语言表示的方法,这意味着我们在大型文本 将环境变量配置到test/train_*.sh中 +#### 模型训练 + - 单卡训练 启动单卡训练 @@ -164,6 +166,26 @@ BERT是一种与训练语言表示的方法,这意味着我们在大型文本 bash train_ID0495_Bert-Squad_performance_8p.sh ``` +#### 分布式插件使能分布式 + +分布式统一训练脚本`./test/train_ID0495_Bert-Squad_performance_distribute.sh`, 该脚本由`./test/train_ID0495_Bert-Squad_performance_1p.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改 + +训练前请下载工具并根据说明完成配置 + +工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute + + +- 8p训练 +``` +python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_ID0495_Bert-Squad_performance_distribute.sh --data_path=/npu/traindata" +``` + + +- 16p训练 + +``` +python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_ID0495_Bert-Squad_performance_distribute.sh --data_path=/npu/traindata" +```

高级参考

diff --git a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh new file mode 100644 index 0000000000000000000000000000000000000000..b5c811e345fb8ee2f5fa2ae2656c8304e3170f8f --- /dev/null +++ b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh @@ -0,0 +1,144 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` +parent_path=$(dirname $(pwd)) + +#集合通信参数,不需要修改 +#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 +export JOB_ID=10087 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="Bertsquad_ID0495_for_TensorFlow" +batch_size=32 +epoch=1 + +#维持参数,不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo "usage:./train_full_8p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + 
data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +vocab_file=${data_path}/model/vocab.txt +bert_config_file=${data_path}/model/bert_config.json +init_checkpoint=${data_path}/model/bert_model.ckpt +train_file=${data_path}/dataset/train-v1.1_small.json +predict_file=${data_path}/dataset/dev-v1.1.json + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" + +#创建DeviceID输出目录,不需要修改 +if [ -d $cur_path/output/$ASCEND_DEVICE_ID ];then + rm -rf $cur_path/output/$ASCEND_DEVICE_ID + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +fi + +#执行训练脚本,需要模型审视修改 +nohup python3.7 ${parent_path}/run_squad.py \ + --vocab_file=$vocab_file \ + --bert_config_file=$bert_config_file \ + --init_checkpoint=$init_checkpoint \ + --train_file=$train_file \ + --do_predict=True \ + --do_train=True \ + --predict_file=$predict_file \ + --train_batch_size=${batch_size} \ + --num_train_epochs=${epoch} \ + --num_train_steps=1000 \ + --learning_rate=3e-5 \ + --max_seq_length=384 \ + --doc_stride=128 \ + --output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#E2E训练端到端时长,直接计算,不需要修改 +echo "E2E training Duration sec: $e2e_time" + +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +#获取性能数据 +step_per_sec=`grep "global_step/sec:" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 
'global_step/sec:' '{print $2}'|awk 'END {print $1}'` +ActualFPS=`awk 'BEGIN {printf "%.2f\n", '${step_per_sec}' * '${batch_size}' * '${RANK_SIZE}'}'` +TrainingTime=`awk 'BEGIN {printf "%.2f\n", '8000' * '${batch_size}' / '${ActualFPS}'}'` + +ActualLoss=`grep "loss =" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 'loss =' '{print $2}'|awk 'END {print $1}'|tr -d ,` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +sed -i -e '/ModuleNotFoundError/d' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md index 4b9bb0dc614ea8ce2ea1dacc22b6248aacc45ce5..40d769044324ba3a6917402f4bf7cf4a08797012 100644 --- a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md @@ -151,6 +151,8 @@ python3 ${work_path}/src/utils/create_squad_data.py --train_file=${data_path}/tr ## 模型训练 +#### 模型训练 + - 单击“立即下载”,并选择合适的下载方式下载源码包。 - 开始训练。 @@ -226,6 +228,26 @@ python3 ${work_path}/src/utils/create_squad_data.py --train_file=${data_path}/tr bash 
train_ID3220_BertLarge-Squad2.0_performance_1p.sh --data_path=/home ``` +#### 分布式插件使能分布式 + +ID0060网络分布式统一训练脚本`./test/train_ID0060_BertBase_performance_distribute.sh`, 该脚本由`./test/train_ID0060_BertBase_performance_8p.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改 + +训练前请下载工具并根据说明完成配置 + +工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute + + +- 8p训练 +``` +python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_ID0060_BertBase_performance_distribute.sh --data_path=/npu/traindata" +``` + + +- 16p训练 + +``` +python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_ID0060_BertBase_performance_distribute.sh --data_path=/npu/traindata" +```

高级参考

diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh new file mode 100644 index 0000000000000000000000000000000000000000..63e15339588a1cf44bbbcdf974bd411c2c9e000b --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh @@ -0,0 +1,175 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export JOB_ID=99990001 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="Bert-base_ID0060_for_TensorFlow" +#训练epoch +train_epochs=1 +#训练batch_size +batch_size=128 +#训练step +train_steps=1000 +#学习率 +learning_rate= + +#维测参数,precision_mode需要模型审视修改 +#precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False +autotune=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo "usage:./train_full_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p 
${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" + +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt${ASCEND_DEVICE_ID} +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt${ASCEND_DEVICE_ID} +fi + + # 绑核,不需要的绑核的模型删除,需要模型审视修改 +corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` +let a=$ASCEND_DEVICE_ID*${corenum}/${RANK_SIZE} +let b=$ASCEND_DEVICE_ID+1 +let c=b*${corenum}/${RANK_SIZE}-1 + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path +if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" +fi +nohup ${bind_core} python3.7 $cur_path/../src/run_pretraining.py --bert_config_file=${cur_path}/../configs/bert_base_config.json \ +--max_seq_length=128 \ +--max_predictions_per_seq=20 \ +--train_batch_size=${batch_size} \ +--learning_rate=1e-4 \ +--num_warmup_steps=0 \ +--num_train_steps=${train_steps} \ +--optimizer_type=adam \ +--manual_fp16=True \ +--use_fp16_cls=True \ +--input_files_dir=${data_path}/tfrecord/seq_len_128_max_pred_20/wikicorpus_en/training \ +--eval_files_dir=${data_path}/tfrecord/seq_len_128_max_pred_20/wikicorpus_en/test \ +--npu_bert_debug=False \ +--npu_bert_use_tdt=True \ +--do_train=True \ +--num_accumulation_steps=1 \ +--npu_bert_job_start_file= \ 
+--iterations_per_loop=100 \ +--save_checkpoints_steps=1000 \ +--npu_bert_clip_by_global_norm=False \ +--distributed=True \ +--npu_bert_tail_optimize=True \ +--npu_bert_loss_scale=0 \ +--output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +ActualFPS=`grep Throughput ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk 'END {print $6}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}' * '${RANK_SIZE}' / '${ActualFPS}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $ActualFPS" + +#输出训练精度,需要模型审视修改 +#train_accuracy=`grep -A 1 top1 $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $3}'` +#打印,不需要修改 +#echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "tensorflow:loss =" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss = " '{print $2}' | awk -F "," '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" 
>> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/README.md b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/README.md index 00b2d88975283cfc36a86dfb350967e4b37df0e6..8ef66fccc25480bb818550dc442c0121dcbec775 100644 --- a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/README.md +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/README.md @@ -128,6 +128,9 @@ npu_device.global_options().precision_mode = 'allow_mix_precision' 2. 数据集标注文件需要先后使用scripts目录下coco_convert.py及coco_annotation.py生成。标注文件生成后即内含图片路径及box信息,故数据集图片文件不可随意移动位置。 ## 模型训练 + +#### 模型训练 + - 单击“立即下载”,并选择合适的下载方式下载源码包。 - 开始训练。 @@ -168,6 +171,27 @@ npu_device.global_options().precision_mode = 'allow_mix_precision' ├─val2017.txt ``` +#### 分布式插件使能分布式 + +分布式统一训练脚本`./test/train_performance_distribute.sh`, 该脚本由`./test/train_performance_8p.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改 + +训练前请下载工具并根据说明完成配置 + +工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute + + +- 8p训练 +``` +python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +``` + + +- 16p训练 + +``` +python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +``` +

迁移学习指导

- 数据集准备。 diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_distribute.sh b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_distribute.sh new file mode 100644 index 0000000000000000000000000000000000000000..0b6bdf0a45799304e7239d521e7682a671bd8f17 --- /dev/null +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_distribute.sh @@ -0,0 +1,196 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' + +#集合通信参数,不需要修改 +export JOB_ID=10087 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +anno_converted='/npu/traindata/COCO2017/val2017.txt' +gt_anno_path='/npu/traindata/COCO2017/annotations/instances_val2017.json' + +#屏蔽TF2.4升级到TF2.6图差异带来的性能下降 +export NPU_EXECUTE_OP_BY_ACL=false + +#设置默认日志级别,不需要修改 +export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="YOLOv5_ID1719_for_TensorFlow2.X" + +# 训练epoch +stage1_epoch=0 +stage2_epoch=1 + +# 训练batchsize +batch_size=8 + +train_worker_num=8 + +# TF2.X独有,不需要修改 +export NPU_LOOPSIZE=1 + +# 精度模式 +precision_mode='allow_mix_precision' +#维持参数,不需要修改 +over_dump=False +over_dump_path='' +data_dump_flag=False +data_dump_path='' +data_dump_step="1" +profiling=False +autotune=False +perf=20 + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo "usage:./train_full_8p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo 
${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be specified" + exit 1 +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) +bind_core=1 +#进入训练脚本目录,需要模型审视修改 + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" + +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/$ASCEND_DEVICE_ID ];then + rm -rf ${cur_path}/output/$ASCEND_DEVICE_ID + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +fi +cd ${cur_path}/output/$ASCEND_DEVICE_ID/ +#执行训练脚本,需要模型审视修改 +corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` +let a=RANK_ID*${corenum}/8 +let b=RANK_ID+1 +let c=b*${corenum}/8-1 +if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" +fi +#${bind_core} python3 ../../../train.py --weights='' \ +nohup ${bind_core} python3 ../../../train.py --weights='' \ + --perf=$perf \ + --model=yolov5m \ + --rank=${RANK_ID} \ + --rank_size=${RANK_SIZE} \ + --train_worker_num=${train_worker_num} \ + --data_path=${data_path} \ + --anno_converted=${anno_converted} \ + --gt_anno_path=${gt_anno_path} \ + --batch_size=${batch_size} \ + --precision_mode=${precision_mode} \ + --stage1_epoch=${stage1_epoch} \ + --stage2_epoch=${stage2_epoch} \ + --over_dump=${over_dump} \ + 
--over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +echo "------------------ Final result ------------------" +#输出性能FPS。需要模型审视修改 +epoch_duration=`grep epoch_duration $cur_path/output/0/train_0.log | awk '{print $2}'` +first_step=`grep duration: $cur_path/output/0/train_0.log |head -1| awk -F "duration:" '{print $2}' |sed s/[[:space:]]//g` +FPS=`awk 'BEGIN{printf "%.2f\n",('$perf'+'$train_worker_num'-2)/('$epoch_duration'-'$first_step')*'$batch_size'*8}'` +echo "Final Performance imgs/sec : $FPS" + +#训练精度,需要从train_$ASCEND_DEVICE_ID.log里,通过关键字获取。需要模型审视修改 +# li=`cat $cur_path/output/0/train_0.log | wc -l` +# num=$(($li - 1)) +# train_accuracy=`sed -n "${num}p" $cur_path/output/0/train_0.log | awk '{print $3}'` +# echo "Final Train Accuracy : ${train_accuracy}" +#E2E训练端到端时长,直接计算,不需要修改 +echo "E2E training Duration sec: $e2e_time" + +#训练用例信息,不需要修改 +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${batch_size}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",('$epoch_duration'-'$first_step')/('$perf'+'$train_worker_num'-2)}'` + +##获取Loss,通过train_*.log中关键字,需要根据模型审视 +grep loss $cur_path/output/0/train_0.log|awk '{print $13}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`grep total_loss: $cur_path/output/0/train_0.log | awk 'END{print $13}'` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo 
"CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +# echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + sed -i "/AttributeError/d" $cur_path/output/${RANK_ID}/train_${RANK_ID}.log +done \ No newline at end of file diff --git a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md index b9000ac78e1d5e45179372ad0a2407976d8f9590..e8745d95ab8d17243f465b7c01b55552697ce9e1 100644 --- a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md +++ b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md @@ -176,6 +176,9 @@ python3 pack_pretraining_data.py --input-glob="path/to/store/binery/files" --out 文件夹路径需要自己创建。 ## 模型训练 + +#### 模型训练 + - 下载训练脚本。 - 检查并修改configs/目录下8卡IP的json配置文件“rank_table_8p.json"。 @@ -261,6 +264,27 @@ python3 pack_pretraining_data.py --input-glob="path/to/store/binery/files" --out 4.1 含pack策略的训练脚本(./test/目录下名字带有"_packed"的脚本即为相应包含pack策略的训练脚本) 使用pack策略进行训练时,需使用pack过后的数据集(train、eval)及对应的预训练模型。若无对应tensorflow-v2版本packed预训练模型,可由tensorflow-v1版本进行转换得来。模型转换相关脚本为bert/tf2_encoder_checkpoint_converter.py,详见:脚本和事例代码 - 模型转换脚本 +#### 分布式插件使能分布式 + +分布式统一训练脚本`./test/train_performance_distribute.sh`, 该脚本由`./test/train_performance_8p_192bs.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改 + +训练前请下载工具并根据说明完成配置 + +工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute + + +- 8p训练 
+``` +python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +``` + + +- 16p训练 + +``` +python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +``` +

高级参考

## 脚本和事例代码 diff --git a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh new file mode 100644 index 0000000000000000000000000000000000000000..ace12437ec9e1ef484d89d3e7b20bcda4d4b9bc3 --- /dev/null +++ b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh @@ -0,0 +1,213 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export JOB_ID=10087 +RANK_ID_START=0 + +export NPU_ENABLE_PERF=true +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge_ID0634_for_TensorFlow2.X" +#训练batch_size +batch_size=192 +eval_batch_size=16 +#训练step +train_steps=1000 +#训练epoch +train_epochs=`expr 768 / ${batch_size}` +#学习率 +learning_rate=0.000144 + +#TF2.X独有,需要模型审视修改 +export NPU_LOOP_SIZE=100 +export GE_USE_STATIC_MEMORY=1 + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +if [[ $1 == --help || $1 == -h ]];then + echo "usage:./train_full_8p_32bs.sh " + + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,需要模型审视修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + 
data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +init_ckpt_path=${data_path}/'tf2_ckpt/model.ckpt-28252' #need modify to actual path +train_files_path=${data_path}/'train/*' #need modify to actual path +eval_files_path=${data_path}/'eval/eval.tfrecord' #need modify to actual path + + + +start_time=$(date +%s) +#############执行训练######################### + +#设置环境变量,不需要修改 +echo "Device ID: $RANK_ID" + +if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} +fi + +#绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 +cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` +cpustep=`expr $cpucount / 8` +echo "taskset c steps:" $cpustep +let a=$ASCEND_DEVICE_ID*$cpustep +let b=$ASCEND_DEVICE_ID+1 +let c=b*$cpustep-1 + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune +if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" +fi +nohup ${bind_core} python3 ../bert/run_pretraining.py \ +--all_reduce_alg=nccl \ + --bert_config_file=../configs/bert_config.json \ +--beta_1=0.91063 \ +--beta_2=0.96497 \ +--device_warmup=False \ +--do_eval=True \ +--dtype=fp16 \ 
+--eval_batch_size=${eval_batch_size} \ +--init_checkpoint=${init_ckpt_path} \ + --train_files=${train_files_path} \ +--eval_files=${eval_files_path} \ +--learning_rate=${learning_rate} \ +--loss_scale=dynamic \ +--max_predictions_per_seq=76 \ +--max_seq_length=512 \ +--model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} \ +--num_accumulation_steps=1 \ +--distribution_strategy=one_device \ +--num_gpus=1 \ +--enable_checkpoint_and_summary=True \ + --num_steps_per_epoch=1000 \ +--num_train_epochs=${train_epochs} \ +--optimizer_type=lamb \ +--scale_loss=False \ +--steps_between_eval=100 \ +--steps_per_loop=${NPU_LOOP_SIZE} \ +--stop_steps=200 \ +--train_batch_size=${batch_size} \ +--verbosity=0 \ +--warmup_steps=0 \ +--precision_mode=${precision_mode} \ +--attention_with_dropout_v3=False \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} \ + --data_dump_path=${data_dump_path} \ +--profiling=${profiling} \ +--profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#############结果处理######################### +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +single_batch_step_sec=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $8}'` +FPS=`awk 'BEGIN{printf "%.2f\n",'${single_batch_step_sec}'*'${batch_size}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'|sed 's/,//g'|cut -c 1-5` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#############冒烟看护######################### +BatchSize=${batch_size} +#设备类型 
+DeviceType=`uname -m` +#用例名称 +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` + +##获取Loss +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中 +grep loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print$11}'|grep -v instead|grep -v masked_lm_loss|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + +sed -i "/AttributeError/d" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log diff --git a/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/ReadMe.md b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/ReadMe.md index 83e2cd7183b77310c61b19e823dc217a05731cda..4ba84ba7c86e84af0a30d0ef2922e08a5f978c35 100644 --- a/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/ReadMe.md 
+++ b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/ReadMe.md @@ -163,6 +163,9 @@ npu_device.global_options().precision_mode=FLAGS.precision_mode ## 模型训练 + +#### 模型训练 + - 下载训练脚本。 - 检查scripts/目录下是否有存在8卡IP的json配置文件“rank_table_8p.json"。 @@ -243,7 +246,26 @@ npu_device.global_options().precision_mode=FLAGS.precision_mode train_performance_8p_49152bs_static_noeval.sh --data_path=${Data_Path} +#### 分布式插件使能分布式 + +分布式统一训练脚本`./test/train_performance_distribute.sh`, 该脚本由`./test/train_performance_8p_32768bs_static_noeval.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P执行过程中rank_table.json和环境变量的差异, 多P可以共用一个脚本, 具体超参请用户根据实际情况修改 + +训练前请下载工具并根据说明完成配置 +工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute + + +- 8p训练 +``` +python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +``` + + +- 16p训练 + +``` +python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata" +```

高级参考

#!/bin/bash
#
# Repository path of this script:
#   TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_performance_distribute.sh
#
# Launches a single Transformer training process for a performance run, waits
# for it to finish, then extracts throughput and loss figures from the training
# log and writes them to a per-case summary file.
#
# Usage:
#   bash train_performance_distribute.sh --data_path=<dataset dir> [options]
#
# Environment variables read but never set here: ASCEND_DEVICE_ID, RANK_ID,
# RANK_SIZE. NOTE(review): presumably exported by the distributed launch tool
# that invokes this script -- confirm against the launcher.
#
# The script uses paths relative to the current working directory
# (../transformer), so it has to be started from the directory it lives in.

# Current working directory; all output goes below ${cur_path}/output.
cur_path=$(pwd)

# Collective-communication parameters (do not modify).
export JOB_ID=10087
RANK_ID_START=0
export PYTHONPATH="../transformer:${PYTHONPATH}"

export NPU_ENABLE_PERF=true

# Dataset path; left empty here and filled in from --data_path.
data_path=""

# Basic parameters (review per model).
# Network name, same as the directory name.
Network="Transformer_ID0633_for_TensorFlow2.X"
# Training batch size.
batch_size=32768
# Number of training steps.
train_steps=500

# Debug parameter; precision_mode needs to be reviewed per model.
precision_mode="allow_mix_precision"
# Maintenance parameters (do not modify).
over_dump=False
data_dump_flag=False
data_dump_step="10"
profiling=False

# Help text. The exit status stays 1, as in the original script.
if [[ $1 == --help || $1 == -h ]]; then
    echo "usage:./train_performance_distribute.sh --data_path=<path> [options]"
    echo " "
    echo "parameter explain:
    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
    --over_dump              if or not over detection, default is False
    --data_dump_flag         data dump flag, default is False
    --data_dump_step         data dump step, default is 10
    --profiling              if or not profiling for performance debug, default is False
    --bind_core              bind the training process to a CPU range with taskset when set
    --data_path              source data of training
    -h/--help                show help message
    "
    exit 1
fi

# Argument parsing. Every option has the form --name=value; the value is
# whatever follows the first '='. Patterns are prefix matches and are tried in
# order, so e.g. --over_dump_path=... is handled by the --over_dump* arm.
for para in "$@"; do
    case "${para}" in
        --precision_mode*)
            precision_mode=${para#*=}
            ;;
        --over_dump*)
            over_dump=${para#*=}
            over_dump_path="${cur_path}/output/overflow_dump"
            mkdir -p "${over_dump_path}"
            ;;
        --data_dump_flag*)
            data_dump_flag=${para#*=}
            data_dump_path="${cur_path}/output/data_dump"
            mkdir -p "${data_dump_path}"
            ;;
        --data_dump_step*)
            data_dump_step=${para#*=}
            ;;
        --profiling*)
            profiling=${para#*=}
            profiling_dump_path="${cur_path}/output/profiling"
            mkdir -p "${profiling_dump_path}"
            ;;
        --data_path*)
            data_path=${para#*=}
            ;;
        --bind_core*)
            bind_core=${para#*=}
            name_bind="_bindcore"
            ;;
    esac
done

# --data_path is mandatory.
if [[ -z "${data_path}" ]]; then
    echo "[Error] para \"data_path\" must be confing"
    exit 1
fi

start_time=$(date +%s)

############# Run training #########################

echo "Device ID: $RANK_ID"

# Start from a clean per-device output directory. The checkpoint directory is
# named "ckpt" so that it matches the --model_dir passed to the trainer below
# (the previous "ckpt_${learning_rate}" used a variable that is never defined).
device_dir="${cur_path:?}/output/${ASCEND_DEVICE_ID}"
if [[ -d "${device_dir}" ]]; then
    rm -rf -- "${device_dir}"
fi
mkdir -p "${device_dir}/ckpt"

train_log="${device_dir}/train_${ASCEND_DEVICE_ID}.log"

# Core binding: the CPUs reported by lscpu are split into 8 equal slices and
# the slice index is ASCEND_DEVICE_ID.
cpucount=$(lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}')
cpustep=$(( cpucount / 8 ))
echo "taskset c steps:" "${cpustep}"
first_cpu=$(( ${ASCEND_DEVICE_ID:-0} * cpustep ))
last_cpu=$(( (${ASCEND_DEVICE_ID:-0} + 1) * cpustep - 1 ))

# Any non-empty bind_core value enables binding; the value itself is not used.
# An array keeps the optional command prefix free of word-splitting surprises.
bind_cmd=()
if [[ -n "${bind_core}" ]]; then
    bind_cmd=(taskset -c "${first_cpu}-${last_cpu}")
fi

# Launch the trainer in the background with stdout/stderr captured in the
# per-device log, then block until it exits.
nohup "${bind_cmd[@]}" python3 ../transformer/official/nlp/transformer/transformer_main.py \
    --data_dir="${data_path}" \
    --model_dir="${device_dir}/ckpt" \
    --vocab_file="${data_path}/vocab.ende.32768" \
    --param_set=big \
    --train_steps="${train_steps}" \
    --static_batch=true \
    --batch_size="${batch_size}" \
    --steps_between_evals=100 \
    --max_length=64 \
    --mode=train \
    --decode_batch_size=32 \
    --decode_max_length=97 \
    --padded_decode=False \
    --num_gpus=1 \
    --dtype=fp16 \
    --distribution_strategy='one_device' \
    --enable_time_history=true \
    --log_steps=100 \
    --loss_scale='dynamic' \
    --precision_mode="${precision_mode}" \
    --over_dump="${over_dump}" \
    --over_dump_path="${over_dump_path}" \
    --data_dump_flag="${data_dump_flag}" \
    --data_dump_step="${data_dump_step}" \
    --data_dump_path="${data_dump_path}" \
    --profiling="${profiling}" \
    --profiling_dump_path="${profiling_dump_path}" > "${train_log}" 2>&1 &

wait

# End-to-end wall-clock duration in seconds.
end_time=$(date +%s)
e2e_time=$(( end_time - start_time ))

############# Result processing #########################
echo "------------------ Final result ------------------"

# Performance figure: mean of field 8 over the TimeHistory log lines, skipping
# the first such line. Nothing is printed when there are no samples, instead
# of dividing by zero.
single_batch_step_sec=$(awk '
    /TimeHistory/ { seen++; if (seen > 1) { sum += $8; cnt++ } }
    END { if (cnt > 0) print sum / cnt }
' "${train_log}")
# Values are handed to awk with -v rather than spliced into the program text,
# so an empty value can no longer produce an awk syntax error.
FPS=$(awk -v v="${single_batch_step_sec}" 'BEGIN { if (v != "") printf "%.2f\n", v }')
echo "Final Performance images/sec : $FPS"

# Training accuracy: field 5 of the last eval_accuracy line (mlp_log lines
# excluded), commas removed, truncated to 5 characters.
train_accuracy=$(grep eval_accuracy "${train_log}" | grep -v mlp_log \
    | awk 'END {print $5}' | sed 's/,//g' | cut -c 1-5)
echo "Final Train Accuracy : ${train_accuracy}"
echo "E2E Training Duration sec : $e2e_time"

############# Smoke-test summary #########################
BatchSize=${batch_size}
# Machine hardware name.
DeviceType=$(uname -m)
# Case name used for the summary and loss file names.
CaseName="${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}p_perf"

# Throughput.
ActualFPS=${FPS}
# Per-iteration training time: BatchSize * RANK_SIZE * 1000 / FPS. Left empty
# when FPS is missing or zero.
TrainingTime=$(awk -v bs="${BatchSize}" -v ranks="${RANK_SIZE}" -v fps="${FPS}" \
    'BEGIN { if (fps + 0 > 0) printf "%.2f\n", bs * ranks * 1000 / fps }')

# Extract the loss column (field 8 of "Train history" lines, with "[" and "]}"
# stripped) into the per-case loss file.
loss_file="${device_dir}/train_${CaseName}_loss.txt"
grep 'Train history' "${train_log}" | awk '{print $8}' \
    | sed -e 's/\[//g' -e 's/\]}//g' >> "${loss_file}"

# Loss of the last logged iteration.
ActualLoss=$(tail -n 1 "${loss_file}")

# Write the key figures to ${CaseName}.log. TrainAccuracy now reports the
# accuracy extracted above rather than repeating the loss value.
{
    echo "Network = ${Network}"
    echo "RankSize = ${RANK_SIZE}"
    echo "BatchSize = ${BatchSize}"
    echo "DeviceType = ${DeviceType}"
    echo "CaseName = ${CaseName}"
    echo "ActualFPS = ${ActualFPS}"
    echo "TrainingTime = ${TrainingTime}"
    echo "TrainAccuracy = ${train_accuracy}"
    echo "ActualLoss = ${ActualLoss}"
    echo "E2ETrainingTime = ${e2e_time}"
} > "${device_dir}/${CaseName}.log"