diff --git a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md
index 9ff988e59c3e48d2b523da88dbae7d6ba8acaf0b..c31b0b5991fcce3902785743f6a1391472e942ab 100644
--- a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md
+++ b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/README.md
@@ -142,6 +142,8 @@ BERT是一种与训练语言表示的方法,这意味着我们在大型文本
将环境变量配置到test/train_*.sh中
+#### 模型训练
+
- 单卡训练
启动单卡训练
@@ -164,6 +166,26 @@ BERT是一种与训练语言表示的方法,这意味着我们在大型文本
bash train_ID0495_Bert-Squad_performance_8p.sh
```
+#### 分布式插件使能分布式
+
+分布式统一训练脚本`./test/train_ID0495_Bert-Squad_performance_distribute.sh`, 该脚本由`./test/train_ID0495_Bert-Squad_performance_1p.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改
+
+训练前请下载工具并根据说明完成配置
+
+工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute
+
+
+- 8p训练
+```
+python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_ID0495_Bert-Squad_performance_distribute.sh --data_path=/npu/traindata"
+```
+
+
+- 16p训练
+
+```
+python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_ID0495_Bert-Squad_performance_distribute.sh --data_path=/npu/traindata"
+```
高级参考
diff --git a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b5c811e345fb8ee2f5fa2ae2656c8304e3170f8f
--- /dev/null
+++ b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_performance_distribute.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+#当前路径,不需要修改
+cur_path=`pwd`
+parent_path=$(dirname $(pwd))
+
+#集合通信参数,不需要修改
+#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下
+export JOB_ID=10087
+RANK_ID_START=0
+
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#基础参数 需要模型审视修改
+#网络名称,同目录名称
+Network="Bertsquad_ID0495_for_TensorFlow"
+batch_size=32
+epoch=1
+
+#维持参数,不需要修改
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+# Help: when the first argument is --help or -h, print the usage text and exit with status 1 (echo needs a space before its argument).
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_full_8p.sh "
+    echo " "
+    echo "parameter explain:
+ --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+ --over_dump if or not over detection, default is False
+ --data_dump_flag data dump flag, default is 0
+ --data_dump_step data dump step, default is 10
+ --profiling if or not profiling for performance debug, default is False
+ --autotune whether to enable autotune, default is False
+ --data_path source data of training
+ -h/--help show help message
+ "
+    exit 1
+fi
+
+# Parse --key=value options. "$@" keeps every argument intact (no word splitting/globbing); ${para#*=} drops everything through the first '='.
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p "${over_dump_path}"
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p "${data_dump_path}"
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p "${profiling_dump_path}"
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    fi
+done
+
+# data_path is mandatory: report the problem and exit with status 1 when --data_path was not supplied.
+if [[ -z $data_path ]];then
+    echo "[Error] para \"data_path\" must be specified"
+    exit 1
+fi
+
+
+vocab_file=${data_path}/model/vocab.txt
+bert_config_file=${data_path}/model/bert_config.json
+init_checkpoint=${data_path}/model/bert_model.ckpt
+train_file=${data_path}/dataset/train-v1.1_small.json
+predict_file=${data_path}/dataset/dev-v1.1.json
+
+#训练开始时间,不需要修改
+start_time=$(date +%s)
+
+#进入训练脚本目录,需要模型审视修改
+
+#设置环境变量,不需要修改
+echo "Device ID: $RANK_ID"
+
+#创建DeviceID输出目录,不需要修改
+if [ -d $cur_path/output/$ASCEND_DEVICE_ID ];then
+ rm -rf $cur_path/output/$ASCEND_DEVICE_ID
+ mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+else
+ mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+fi
+
+#执行训练脚本,需要模型审视修改
+nohup python3.7 ${parent_path}/run_squad.py \
+ --vocab_file=$vocab_file \
+ --bert_config_file=$bert_config_file \
+ --init_checkpoint=$init_checkpoint \
+ --train_file=$train_file \
+ --do_predict=True \
+ --do_train=True \
+ --predict_file=$predict_file \
+ --train_batch_size=${batch_size} \
+ --num_train_epochs=${epoch} \
+ --num_train_steps=1000 \
+ --learning_rate=3e-5 \
+ --max_seq_length=384 \
+ --doc_stride=128 \
+ --output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+#训练结束时间,不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#E2E训练端到端时长,直接计算,不需要修改
+echo "E2E training Duration sec: $e2e_time"
+
+#训练用例信息,不需要修改
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+#获取性能数据
+step_per_sec=`grep "global_step/sec:" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 'global_step/sec:' '{print $2}'|awk 'END {print $1}'`
+ActualFPS=`awk 'BEGIN {printf "%.2f\n", '${step_per_sec}' * '${batch_size}' * '${RANK_SIZE}'}'`
+TrainingTime=`awk 'BEGIN {printf "%.2f\n", '8000' * '${batch_size}' / '${ActualFPS}'}'`
+
+ActualLoss=`grep "loss =" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 'loss =' '{print $2}'|awk 'END {print $1}'|tr -d ,`
+
+#关键信息打印到${CaseName}.log中,不需要修改
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+sed -i -e '/ModuleNotFoundError/d' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log
diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md
index 4b9bb0dc614ea8ce2ea1dacc22b6248aacc45ce5..40d769044324ba3a6917402f4bf7cf4a08797012 100644
--- a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md
+++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md
@@ -151,6 +151,8 @@ python3 ${work_path}/src/utils/create_squad_data.py --train_file=${data_path}/tr
## 模型训练
+#### 模型训练
+
- 单击“立即下载”,并选择合适的下载方式下载源码包。
- 开始训练。
@@ -226,6 +228,26 @@ python3 ${work_path}/src/utils/create_squad_data.py --train_file=${data_path}/tr
bash train_ID3220_BertLarge-Squad2.0_performance_1p.sh --data_path=/home
```
+#### 分布式插件使能分布式
+
+ID0060网络分布式统一训练脚本`./test/train_ID0060_BertBase_performance_distribute.sh`, 该脚本由`./test/train_ID0060_BertBase_performance_8p.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改
+
+训练前请下载工具并根据说明完成配置
+
+工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute
+
+
+- 8p训练
+```
+python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_ID0060_BertBase_performance_distribute.sh --data_path=/npu/traindata"
+```
+
+
+- 16p训练
+
+```
+python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_ID0060_BertBase_performance_distribute.sh --data_path=/npu/traindata"
+```
高级参考
diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh
new file mode 100644
index 0000000000000000000000000000000000000000..63e15339588a1cf44bbbcdf974bd411c2c9e000b
--- /dev/null
+++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID0060_BertBase_performance_distribute.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+
+#当前路径,不需要修改
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+export JOB_ID=99990001
+RANK_ID_START=0
+
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#基础参数,需要模型审视修改
+#网络名称,同目录名称
+Network="Bert-base_ID0060_for_TensorFlow"
+#训练epoch
+train_epochs=1
+#训练batch_size
+batch_size=128
+#训练step
+train_steps=1000
+#学习率
+learning_rate=
+
+#维测参数,precision_mode需要模型审视修改
+#precision_mode="allow_mix_precision"
+#维持参数,以下不需要修改
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+autotune=False
+
+# Help: when the first argument is --help or -h, print the usage text and exit with status 1 (echo needs a space before its argument).
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_full_1p.sh "
+    echo " "
+    echo "parameter explain:
+ --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+ --over_dump if or not over detection, default is False
+ --data_dump_flag data dump flag, default is False
+ --data_dump_step data dump step, default is 10
+ --profiling if or not profiling for performance debug, default is False
+ --autotune whether to enable autotune, default is False
+ --data_path source data of training
+ -h/--help show help message
+ "
+    exit 1
+fi
+
+# Parse --key=value options. "$@" keeps every argument intact (no word splitting/globbing); ${para#*=} drops everything through the first '='.
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p "${over_dump_path}"
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p "${data_dump_path}"
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p "${profiling_dump_path}"
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    fi
+done
+
+# data_path is mandatory: report the problem and exit with status 1 when --data_path was not supplied.
+if [[ -z $data_path ]];then
+    echo "[Error] para \"data_path\" must be specified"
+    exit 1
+fi
+
+#训练开始时间,不需要修改
+start_time=$(date +%s)
+
+#进入训练脚本目录,需要模型审视修改
+
+#设置环境变量,不需要修改
+echo "Device ID: $RANK_ID"
+
+#创建DeviceID输出目录,不需要修改
+if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+ rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+ mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt${ASCEND_DEVICE_ID}
+else
+ mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt${ASCEND_DEVICE_ID}
+fi
+
+ # 绑核,不需要的绑核的模型删除,需要模型审视修改
+corenum=`cat /proc/cpuinfo |grep "processor"|wc -l`
+let a=$ASCEND_DEVICE_ID*${corenum}/${RANK_SIZE}
+let b=$ASCEND_DEVICE_ID+1
+let c=b*${corenum}/${RANK_SIZE}-1
+
+#执行训练脚本,以下传参不需要修改,其他需要模型审视修改
+#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path
+if [ "x${bind_core}" != x ];then
+ bind_core="taskset -c $a-$c"
+fi
+nohup ${bind_core} python3.7 $cur_path/../src/run_pretraining.py --bert_config_file=${cur_path}/../configs/bert_base_config.json \
+--max_seq_length=128 \
+--max_predictions_per_seq=20 \
+--train_batch_size=${batch_size} \
+--learning_rate=1e-4 \
+--num_warmup_steps=0 \
+--num_train_steps=${train_steps} \
+--optimizer_type=adam \
+--manual_fp16=True \
+--use_fp16_cls=True \
+--input_files_dir=${data_path}/tfrecord/seq_len_128_max_pred_20/wikicorpus_en/training \
+--eval_files_dir=${data_path}/tfrecord/seq_len_128_max_pred_20/wikicorpus_en/test \
+--npu_bert_debug=False \
+--npu_bert_use_tdt=True \
+--do_train=True \
+--num_accumulation_steps=1 \
+--npu_bert_job_start_file= \
+--iterations_per_loop=100 \
+--save_checkpoints_steps=1000 \
+--npu_bert_clip_by_global_norm=False \
+--distributed=True \
+--npu_bert_tail_optimize=True \
+--npu_bert_loss_scale=0 \
+--output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+#训练结束时间,不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#结果打印,不需要修改
+echo "------------------ Final result ------------------"
+#输出性能FPS,需要模型审视修改
+ActualFPS=`grep Throughput ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk 'END {print $6}'`
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}' * '${RANK_SIZE}' / '${ActualFPS}'}'`
+#打印,不需要修改
+echo "Final Performance images/sec : $ActualFPS"
+
+#输出训练精度,需要模型审视修改
+#train_accuracy=`grep -A 1 top1 $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $3}'`
+#打印,不需要修改
+#echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#稳定性精度看护结果汇总
+#训练用例信息,不需要修改
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视
+grep "tensorflow:loss =" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss = " '{print $2}' | awk -F "," '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#最后一个迭代loss值,不需要修改
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#关键信息打印到${CaseName}.log中,不需要修改
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/README.md b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/README.md
index 00b2d88975283cfc36a86dfb350967e4b37df0e6..8ef66fccc25480bb818550dc442c0121dcbec775 100644
--- a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/README.md
+++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/README.md
@@ -128,6 +128,9 @@ npu_device.global_options().precision_mode = 'allow_mix_precision'
2. 数据集标注文件需要先后使用scripts目录下coco_convert.py及coco_annotation.py生成。标注文件生成后即内含图片路径及box信息,故数据集图片文件不可随意移动位置。
## 模型训练
+
+#### 模型训练
+
- 单击“立即下载”,并选择合适的下载方式下载源码包。
- 开始训练。
@@ -168,6 +171,27 @@ npu_device.global_options().precision_mode = 'allow_mix_precision'
├─val2017.txt
```
+#### 分布式插件使能分布式
+
+分布式统一训练脚本`./test/train_performance_distribute.sh`, 该脚本由`./test/train_performance_8p.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改
+
+训练前请下载工具并根据说明完成配置
+
+工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute
+
+
+- 8p训练
+```
+python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata"
+```
+
+
+- 16p训练
+
+```
+python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata"
+```
+
迁移学习指导
- 数据集准备。
diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_distribute.sh b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_distribute.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0b6bdf0a45799304e7239d521e7682a671bd8f17
--- /dev/null
+++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_distribute.sh
@@ -0,0 +1,196 @@
+#!/bin/bash
+
+#当前路径,不需要修改
+cur_path=`pwd`
+
+export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+
+#集合通信参数,不需要修改
+export JOB_ID=10087
+RANK_ID_START=0
+
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+anno_converted='/npu/traindata/COCO2017/val2017.txt'
+gt_anno_path='/npu/traindata/COCO2017/annotations/instances_val2017.json'
+
+#屏蔽TF2.4升级到TF2.6图差异带来的性能下降
+export NPU_EXECUTE_OP_BY_ACL=false
+
+#设置默认日志级别,不需要修改
+export ASCEND_GLOBAL_LOG_LEVEL_ETP=3
+
+#基础参数 需要模型审视修改
+#网络名称,同目录名称
+Network="YOLOv5_ID1719_for_TensorFlow2.X"
+
+# 训练epoch
+stage1_epoch=0
+stage2_epoch=1
+
+# 训练batchsize
+batch_size=8
+
+train_worker_num=8
+
+# TF2.X独有,不需要修改
+export NPU_LOOPSIZE=1
+
+# 精度模式
+precision_mode='allow_mix_precision'
+#维持参数,不需要修改
+over_dump=False
+over_dump_path=''
+data_dump_flag=False
+data_dump_path=''
+data_dump_step="1"
+profiling=False
+autotune=False
+perf=20
+
+# Help: when the first argument is --help or -h, print the usage text and exit with status 1 (echo needs a space before its argument).
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_full_8p.sh "
+    echo " "
+    echo "parameter explain:
+ --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+ --over_dump if or not over detection, default is False
+ --data_dump_flag data dump flag, default is 0
+ --data_dump_step data dump step, default is 10
+ --profiling if or not profiling for performance debug, default is False
+ --data_path source data of training
+ -h/--help show help message
+ "
+    exit 1
+fi
+
+# Parse --key=value options. "$@" keeps every argument intact (no word splitting/globbing); ${para#*=} drops everything through the first '='.
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p "${over_dump_path}"
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p "${data_dump_path}"
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p "${profiling_dump_path}"
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --bind_core* ]]; then
+        bind_core=${para#*=}
+        name_bind="_bindcore"
+    fi
+done
+
+#校验是否传入data_path,不需要修改
+if [[ $data_path == "" ]];then
+ echo "[Error] para \"data_path\" must be specified"
+ exit 1
+fi
+
+#训练开始时间,不需要修改
+start_time=$(date +%s)
+bind_core=1
+#进入训练脚本目录,需要模型审视修改
+
+#设置环境变量,不需要修改
+echo "Device ID: $RANK_ID"
+
+#创建DeviceID输出目录,不需要修改
+if [ -d ${cur_path}/output/$ASCEND_DEVICE_ID ];then
+ rm -rf ${cur_path}/output/$ASCEND_DEVICE_ID
+ mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+else
+ mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+fi
+cd ${cur_path}/output/$ASCEND_DEVICE_ID/
+#执行训练脚本,需要模型审视修改
+corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l`
+let a=RANK_ID*${corenum}/8
+let b=RANK_ID+1
+let c=b*${corenum}/8-1
+if [ "x${bind_core}" != x ];then
+ bind_core="taskset -c $a-$c"
+fi
+#${bind_core} python3 ../../../train.py --weights='' \
+nohup ${bind_core} python3 ../../../train.py --weights='' \
+ --perf=$perf \
+ --model=yolov5m \
+ --rank=${RANK_ID} \
+ --rank_size=${RANK_SIZE} \
+ --train_worker_num=${train_worker_num} \
+ --data_path=${data_path} \
+ --anno_converted=${anno_converted} \
+ --gt_anno_path=${gt_anno_path} \
+ --batch_size=${batch_size} \
+ --precision_mode=${precision_mode} \
+ --stage1_epoch=${stage1_epoch} \
+ --stage2_epoch=${stage2_epoch} \
+ --over_dump=${over_dump} \
+ --over_dump_path=${over_dump_path} \
+ --data_dump_flag=${data_dump_flag} \
+ --data_dump_step=${data_dump_step} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+#训练结束时间,不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+echo "------------------ Final result ------------------"
+#输出性能FPS。需要模型审视修改
+epoch_duration=`grep epoch_duration $cur_path/output/0/train_0.log | awk '{print $2}'`
+first_step=`grep duration: $cur_path/output/0/train_0.log |head -1| awk -F "duration:" '{print $2}' |sed s/[[:space:]]//g`
+FPS=`awk 'BEGIN{printf "%.2f\n",('$perf'+'$train_worker_num'-2)/('$epoch_duration'-'$first_step')*'$batch_size'*8}'`
+echo "Final Performance imgs/sec : $FPS"
+
+#训练精度,需要从train_$ASCEND_DEVICE_ID.log里,通过关键字获取。需要模型审视修改
+# li=`cat $cur_path/output/0/train_0.log | wc -l`
+# num=$(($li - 1))
+# train_accuracy=`sed -n "${num}p" $cur_path/output/0/train_0.log | awk '{print $3}'`
+# echo "Final Train Accuracy : ${train_accuracy}"
+#E2E训练端到端时长,直接计算,不需要修改
+echo "E2E training Duration sec: $e2e_time"
+
+#训练用例信息,不需要修改
+DeviceType=`uname -m`
+CaseName=${Network}${name_bind}_bs${batch_size}_${RANK_SIZE}'p'_'perf'
+
+##获取性能数据,不需要修改
+#吞吐量
+ActualFPS=${FPS}
+#单迭代训练时长
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",('$epoch_duration'-'$first_step')/('$perf'+'$train_worker_num'-2)}'`
+
+##获取Loss,通过train_*.log中关键字,需要根据模型审视
+grep loss $cur_path/output/0/train_0.log|awk '{print $13}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#最后一个迭代loss值,不需要修改
+ActualLoss=`grep total_loss: $cur_path/output/0/train_0.log | awk 'END{print $13}'`
+
+#关键信息打印到${CaseName}.log中,不需要修改
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+# echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+
+# Strip AttributeError noise from this process's own training log only. Logs are
+# written under output/${ASCEND_DEVICE_ID}, not output/${RANK_ID}, and running
+# `sed -i` on another process's log could replace a file that is still being written.
+sed -i "/AttributeError/d" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log
\ No newline at end of file
diff --git a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md
index b9000ac78e1d5e45179372ad0a2407976d8f9590..e8745d95ab8d17243f465b7c01b55552697ce9e1 100644
--- a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md
+++ b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/ReadMe.md
@@ -176,6 +176,9 @@ python3 pack_pretraining_data.py --input-glob="path/to/store/binery/files" --out
文件夹路径需要自己创建。
## 模型训练
+
+#### 模型训练
+
- 下载训练脚本。
- 检查并修改configs/目录下8卡IP的json配置文件“rank_table_8p.json"。
@@ -261,6 +264,27 @@ python3 pack_pretraining_data.py --input-glob="path/to/store/binery/files" --out
4.1 含pack策略的训练脚本(./test/目录下名字带有"_packed"的脚本即为相应包含pack策略的训练脚本)
使用pack策略进行训练时,需使用pack过后的数据集(train、eval)及对应的预训练模型。若无对应tensorflow-v2版本packed预训练模型,可由tensorflow-v1版本进行转换得来。模型转换相关脚本为bert/tf2_encoder_checkpoint_converter.py,详见:脚本和事例代码 - 模型转换脚本
+#### 分布式插件使能分布式
+
+分布式统一训练脚本`./test/train_performance_distribute.sh`, 该脚本由`./test/train_performance_8p_192bs.sh`修改而来, 具体差异可自行比对, 分布式插件屏蔽了多P 执行过程中rank_table.json和环境变量的差异, 多P可以共有一个脚本, 具体超参请用户根据实际情况修改
+
+训练前请下载工具并根据说明完成配置
+
+工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute
+
+
+- 8p训练
+```
+python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata"
+```
+
+
+- 16p训练
+
+```
+python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata"
+```
+
高级参考
## 脚本和事例代码
diff --git a/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ace12437ec9e1ef484d89d3e7b20bcda4d4b9bc3
--- /dev/null
+++ b/TensorFlow2/built-in/nlp/BertLarge_ID0634_for_TensorFlow2.X/test/train_performance_distribute.sh
@@ -0,0 +1,213 @@
+#!/bin/bash
+
+# Directory the script is launched from; used below as the base of the output/ tree.
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+export JOB_ID=10087
+RANK_ID_START=0
+
+export NPU_ENABLE_PERF=true
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#基础参数 需要模型审视修改
+#网络名称,同目录名称
+Network="BertLarge_ID0634_for_TensorFlow2.X"
+#训练batch_size
+batch_size=192
+eval_batch_size=16
+#训练step
+train_steps=1000
+#训练epoch
+train_epochs=`expr 768 / ${batch_size}`
+#学习率
+learning_rate=0.000144
+
+#TF2.X独有,需要模型审视修改
+export NPU_LOOP_SIZE=100
+export GE_USE_STATIC_MEMORY=1
+
+#维测参数,precision_mode需要模型审视修改
+precision_mode="allow_mix_precision"
+#维持参数,不需要修改
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+# Help: when the first argument is --help or -h, print the usage text and exit with status 1 (echo needs a space before its argument).
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_full_8p_32bs.sh "
+    echo " "
+    echo "parameter explain:
+ --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+ --over_dump if or not over detection, default is False
+ --data_dump_flag data dump flag, default is 0
+ --data_dump_step data dump step, default is 10
+ --profiling if or not profiling for performance debug, default is False
+ --data_path source data of training
+ -h/--help show help message
+ "
+    exit 1
+fi
+
+# Parse --key=value options. "$@" keeps every argument intact (no word splitting/globbing); ${para#*=} drops everything through the first '='.
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p "${over_dump_path}"
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p "${data_dump_path}"
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p "${profiling_dump_path}"
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --bind_core* ]]; then
+        bind_core=${para#*=}
+        name_bind="_bindcore"
+    fi
+done
+
+# data_path is mandatory: report the problem and exit with status 1 when --data_path was not supplied.
+if [[ -z $data_path ]];then
+    echo "[Error] para \"data_path\" must be specified"
+    exit 1
+fi
+
+init_ckpt_path=${data_path}/'tf2_ckpt/model.ckpt-28252' #need modify to actual path
+train_files_path=${data_path}/'train/*' #need modify to actual path
+eval_files_path=${data_path}/'eval/eval.tfrecord' #need modify to actual path
+
+
+
+start_time=$(date +%s)
+#############执行训练#########################
+
+#设置环境变量,不需要修改
+echo "Device ID: $RANK_ID"
+
+if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+ rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+ mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate}
+else
+ mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate}
+fi
+
+#绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改
+cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'`
+cpustep=`expr $cpucount / 8`
+echo "taskset c steps:" $cpustep
+let a=$ASCEND_DEVICE_ID*$cpustep
+let b=$ASCEND_DEVICE_ID+1
+let c=b*$cpustep-1
+
+#执行训练脚本,以下传参不需要修改,其他需要模型审视修改
+#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune
+if [ "x${bind_core}" != x ];then
+ bind_core="taskset -c $a-$c"
+fi
+nohup ${bind_core} python3 ../bert/run_pretraining.py \
+--all_reduce_alg=nccl \
+ --bert_config_file=../configs/bert_config.json \
+--beta_1=0.91063 \
+--beta_2=0.96497 \
+--device_warmup=False \
+--do_eval=True \
+--dtype=fp16 \
+--eval_batch_size=${eval_batch_size} \
+--init_checkpoint=${init_ckpt_path} \
+ --train_files=${train_files_path} \
+--eval_files=${eval_files_path} \
+--learning_rate=${learning_rate} \
+--loss_scale=dynamic \
+--max_predictions_per_seq=76 \
+--max_seq_length=512 \
+--model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt_${learning_rate} \
+--num_accumulation_steps=1 \
+--distribution_strategy=one_device \
+--num_gpus=1 \
+--enable_checkpoint_and_summary=True \
+ --num_steps_per_epoch=1000 \
+--num_train_epochs=${train_epochs} \
+--optimizer_type=lamb \
+--scale_loss=False \
+--steps_between_eval=100 \
+--steps_per_loop=${NPU_LOOP_SIZE} \
+--stop_steps=200 \
+--train_batch_size=${batch_size} \
+--verbosity=0 \
+--warmup_steps=0 \
+--precision_mode=${precision_mode} \
+--attention_with_dropout_v3=False \
+ --over_dump=${over_dump} \
+ --over_dump_path=${over_dump_path} \
+ --data_dump_flag=${data_dump_flag} \
+ --data_dump_step=${data_dump_step} \
+ --data_dump_path=${data_dump_path} \
+--profiling=${profiling} \
+--profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+#训练结束时间,不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#############结果处理#########################
+echo "------------------ Final result ------------------"
+#输出性能FPS,需要模型审视修改
+single_batch_step_sec=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $8}'`
+FPS=`awk 'BEGIN{printf "%.2f\n",'${single_batch_step_sec}'*'${batch_size}'}'`
+#打印,不需要修改
+echo "Final Performance images/sec : $FPS"
+
+#输出训练精度,需要模型审视修改
+train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'|sed 's/,//g'|cut -c 1-5`
+#打印,不需要修改
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#############冒烟看护#########################
+BatchSize=${batch_size}
+#设备类型
+DeviceType=`uname -m`
+#用例名称
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+##获取性能数据
+#吞吐量,不需要修改
+ActualFPS=${FPS}
+#单迭代训练时长,不需要修改
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'`
+
+##获取Loss
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中
+grep loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print$11}'|grep -v instead|grep -v masked_lm_loss|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#最后一个迭代loss值
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#关键信息打印到${CaseName}.log中
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+
+sed -i "/AttributeError/d" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log
diff --git a/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/ReadMe.md b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/ReadMe.md
index 83e2cd7183b77310c61b19e823dc217a05731cda..4ba84ba7c86e84af0a30d0ef2922e08a5f978c35 100644
--- a/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/ReadMe.md
+++ b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/ReadMe.md
@@ -163,6 +163,9 @@ npu_device.global_options().precision_mode=FLAGS.precision_mode
## 模型训练
+
+#### 模型训练
+
- 下载训练脚本。
- 检查scripts/目录下是否有存在8卡IP的json配置文件“rank_table_8p.json"。
@@ -243,7 +246,26 @@ npu_device.global_options().precision_mode=FLAGS.precision_mode
train_performance_8p_49152bs_static_noeval.sh --data_path=${Data_Path}
+#### 分布式插件使能分布式
+
+分布式统一训练脚本为`./test/train_performance_distribute.sh`,该脚本由`./test/train_performance_8p_32768bs_static_noeval.sh`修改而来,具体差异可自行比对。分布式插件屏蔽了多P执行过程中rank_table.json和环境变量的差异,因此多P训练可以共用同一个脚本;具体超参请用户根据实际情况修改。
+
+训练前请下载工具并根据说明完成配置
+工具路径: https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/Tools/ascend_distribute
+
+
+- 8p训练
+```
+python3 $path/distrbute_npu.py --np 8 --env 10.10.10.10:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata"
+```
+
+
+- 16p训练
+
+```
+python3 $path/distrbute_npu.py --np 16 --env 10.10.10.10:8,10.10.10.11:8 --train_command "bash train_performance_distribute.sh --data_path=/npu/traindata"
+```
高级参考
diff --git a/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_performance_distribute.sh b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_performance_distribute.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d759a220560101aac4a1b0ad9fd3c65a529c9d49
--- /dev/null
+++ b/TensorFlow2/built-in/nlp/Transformer_ID0633_for_TensorFlow2.X/test/train_performance_distribute.sh
@@ -0,0 +1,185 @@
+#!/bin/bash
+
+# Current working directory; no need to modify
+cur_path=`pwd`
+
+# Collective-communication settings; no need to modify
+export JOB_ID=10087
+RANK_ID_START=0
+export PYTHONPATH=../transformer:$PYTHONPATH
+
+export NPU_ENABLE_PERF=true
+# Dataset path; leave empty here, it is filled in from the --data_path argument
+data_path=""
+
+# Basic parameters; review and adjust per model
+# Network name, same as the directory name
+Network="Transformer_ID0633_for_TensorFlow2.X"
+# Training batch size
+batch_size=32768
+# Number of training steps
+train_steps=500
+
+# Debug/maintenance parameters; precision_mode should be reviewed per model
+precision_mode="allow_mix_precision"
+# Maintenance parameters; no need to modify
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+if [[ $1 == --help || $1 == -h ]];then  # usage is shown only when help is the first argument
+    echo "usage:./train_performance_distribute.sh <args>"
+
+    echo " "
+    echo "parameter explain:
+    --precision_mode           precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump		     if or not over detection, default is False
+    --data_dump_flag	     data dump flag, default is False
+    --data_dump_step		     data dump step, default is 10
+    --profiling		           if or not profiling for performance debug, default is False
+    --data_path		           source data of training
+    -h/--help		             show help message
+    "
+    exit 1
+fi
+
+# Parse --key=value options; "$@" keeps every argument intact even when it contains spaces (review per model)
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p "${over_dump_path}"
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p "${data_dump_path}"
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p "${profiling_dump_path}"
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --bind_core* ]]; then
+        bind_core=${para#*=}
+        name_bind="_bindcore"  # suffix that later becomes part of CaseName
+    fi
+done
+
+# Abort early when no dataset path was supplied; no need to modify
+if [[ -z $data_path ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+
+############# Run training #########################
+# Wall-clock start in epoch seconds, used later for the end-to-end duration
+start_time=$(date +%s)
+# Print $RANK_ID (labelled as the device ID); no need to modify
+echo "Device ID: $RANK_ID"
+
+# Recreate a clean per-device output dir; "ckpt" matches the --model_dir passed to the training command
+out_dir="${cur_path}/output/${ASCEND_DEVICE_ID}"
+if [ -d "${out_dir}" ];then
+    rm -rf "${out_dir}"
+fi
+mkdir -p "${out_dir}/ckpt"
+
+# Core binding: remove for models that do not need it, adjust for models that do
+cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'`
+cpustep=`expr $cpucount / 8`
+echo "taskset c steps:" $cpustep
+let a=$ASCEND_DEVICE_ID*$cpustep
+let b=$ASCEND_DEVICE_ID+1
+let c=b*$cpustep-1
+
+# Launch the training script; the arguments listed below must be kept, the others should be reviewed per model
+#--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune
+if [ "x${bind_core}" != x ];then
+    bind_core="taskset -c $a-$c"
+fi
+nohup ${bind_core} python3 ../transformer/official/nlp/transformer/transformer_main.py \
+--data_dir=${data_path} \
+--model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \
+--vocab_file=${data_path}/vocab.ende.32768 \
+--param_set=big \
+--train_steps=${train_steps} \
+--static_batch=true \
+--batch_size=${batch_size} \
+--steps_between_evals=100 \
+--max_length=64 \
+--mode=train \
+--decode_batch_size=32 \
+--decode_max_length=97 \
+--padded_decode=False \
+--num_gpus=1 \
+--dtype=fp16 \
+--distribution_strategy='one_device' \
+--enable_time_history=true \
+--log_steps=100 \
+--loss_scale='dynamic' \
+--precision_mode=${precision_mode} \
+    --over_dump=${over_dump} \
+    --over_dump_path=${over_dump_path} \
+    --data_dump_flag=${data_dump_flag} \
+    --data_dump_step=${data_dump_step} \
+    --data_dump_path=${data_dump_path} \
+--profiling=${profiling} \
+--profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+############# Result processing #########################
+echo "------------------ Final result ------------------"
+# Performance (FPS): mean of field 8 over the TimeHistory log lines, skipping the first one; review per model
+single_batch_step_sec=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $8}'|tail -n +2|awk '{sum+=$1} END {print sum/NR}'`
+FPS=`awk 'BEGIN{printf "%.2f\n",'${single_batch_step_sec}'}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Training accuracy: field 5 of the last eval_accuracy line (mlp_log lines excluded), commas removed, first 5 characters; review per model
+train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'|sed 's/,//g'|cut -c 1-5`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+############# Smoke-test summary #########################
+BatchSize=${batch_size}
+# Device type (machine architecture reported by uname -m)
+DeviceType=`uname -m`
+# Test case name
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+## Collect performance data
+# Throughput; no need to modify
+ActualFPS=${FPS}
+# Time per training iteration, computed as BatchSize * RANK_SIZE * 1000 / FPS; no need to modify
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'`
+
+## Collect loss
+# Extract loss values from train_$ASCEND_DEVICE_ID.log and append them to train_${CaseName}_loss.txt
+grep 'Train history' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}'| sed 's/\[//g'|sed 's/\]}//g' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration (last line of the loss file)
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Write the key results to ${CaseName}.log (first line truncates the file, the rest append)
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log