diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/run_squad.py b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/run_squad.py
index c97161572146ed03bf63e757c09cd314fa9fcc1e..34e99463223734a1cc870fbc313e071c0a1698f8 100644
--- a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/run_squad.py
+++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/run_squad.py
@@ -44,6 +44,8 @@ from utils import is_main_process, format_step
 import dllogger, time
 from apex.optimizers import npu_fused_bert_adam, NpuFusedBertAdam
 
+RANK = int(os.getenv('RANK', '-1'))  # global rank exported by the launch scripts; -1 when RANK is unset so the module still imports
+
 # torch._C._jit_set_profiling_mode(False)
 # torch._C._jit_set_profiling_executor(False)
 
@@ -507,7 +509,7 @@ def get_answers(examples, features, results, args):
 
         # In very rare edge cases we could only have single null prediction.
         # So we just create a nonce prediction in this case to avoid failure.
-        if not nbest:                                                    
+        if not nbest:
             nbest.append(Prediction(text="empty", start_logit=0.0, end_logit=0.0))
 
         total_scores = []
@@ -555,7 +557,7 @@ def get_answer_text(example, feature, pred, args):
     return final_text
 
 def get_valid_prelim_predictions(start_indices, end_indices, feature, result, args):
-    
+
     _PrelimPrediction = collections.namedtuple(
         "PrelimPrediction",
         ["start_index", "end_index", "start_logit", "end_logit"])
@@ -733,7 +735,7 @@ def _compute_softmax(scores):
 # from apex.multi_tensor_apply import multi_tensor_applier
 # class GradientClipper:
 #     """
-#     Clips gradient norm of an iterable of parameters. 
+#     Clips gradient norm of an iterable of parameters.
 #     """
 #     def __init__(self, max_grad_norm):
 #         self.max_norm = max_grad_norm
@@ -897,7 +899,7 @@ def main():
                         help="addr used for distributed training")
 
     args = parser.parse_args()
-    args.fp16 = args.fp16 or args.amp    
+    args.fp16 = args.fp16 or args.amp
 
     if args.local_rank == -1 or args.no_cuda:
         if args.use_npu:
@@ -913,7 +915,8 @@ def main():
             os.environ['MASTER_PORT'] = '29668'
             torch.npu.set_device("npu:%d" % args.local_rank)
             device = torch.device("npu:%d" % args.local_rank)
-            torch.distributed.init_process_group(backend='hccl', world_size=8, rank=args.local_rank)
+            print("the RANK is :", RANK)
+            torch.distributed.init_process_group(backend='hccl', world_size=args.num_npu, rank=RANK)
             n_npu = 1
         else:
             torch.cuda.set_device(args.local_rank)
@@ -928,7 +931,7 @@ def main():
                                 dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)])
     else:
         dllogger.init(backends=[])
-        
+
     # print("device: {} n_npu: {}, distributed training: {}, 16-bits training: {}".format(
     #                             device, n_npu, bool(args.local_rank != -1), args.fp16))
     print("train on device {}, rank {}".format(device, args.local_rank))
@@ -1148,7 +1151,7 @@ def main():
                 else:
                     loss.backward()
 
- 
+
                 if (step + 1) % args.gradient_accumulation_steps == 0:
                     if args.fp16 :
                         # modify learning rate with special warm up for BERT which FusedAdam doesn't do
@@ -1280,4 +1283,4 @@ def main():
 
 if __name__ == "__main__":
     main()
-    dllogger.flush()
+    dllogger.flush()
\ No newline at end of file
diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_8p.sh b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_8p.sh
index 97d53c33fc3d6626ad1538182c3a6571451a9650..3b555b4232b1b1a109755b3a6e20836f4907c88d 100644
--- a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_8p.sh
+++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_full_8p.sh
@@ -96,6 +96,7 @@ then
   do
   let p_start=0+24*i
   let p_end=23+24*i
+  export RANK=${i}
   if [ -d ${cur_path}/output/${i} ];then
         rm -rf ${cur_path}/output/${i}
         mkdir -p ${cur_path}/output/$i
@@ -107,6 +108,7 @@ then
 else
   for i in $(seq 0 7)
   do
+  export RANK=${i}
   if [ -d ${cur_path}/output/${i} ];then
       rm -rf ${cur_path}/output/${i}
       mkdir -p ${cur_path}/output/$i
diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_16p.sh b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_16p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..751f6a47637d43d858208d9daeb44f1eea0976ef
--- /dev/null
+++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_16p.sh
@@ -0,0 +1,220 @@
+#!/bin/bash
+
+# Current path (no need to modify)
+cur_path=`pwd`
+#source ../env_npu.sh
+
+data_path=""
+conf_path=""
+server_index=""
+fix_node_ip=""
+# Collective communication parameters (no need to modify)
+
+export RANK_SIZE=16
+
+# Basic parameters (review per model)
+# Network name, same as the directory name
+Network="BertBase_ID0490_for_PyTorch"
+# Training batch_size
+batch_size=80
+
+
+# Parse the --key=value options (no need to modify). "$@" keeps each argument intact.
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --ckpt_path* ]];then
+        ckpt_path=${para#*=}
+    elif [[ $para == --conf_path* ]];then
+        conf_path=${para#*=}
+    elif [[ $para == --server_index* ]];then
+        server_index=${para#*=}
+    elif [[ $para == --fix_node_ip* ]];then
+        fix_node_ip=${para#*=}
+    fi
+done
+
+# Check that data_path was passed in (no need to modify)
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+
+one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'`
+linux_num=`find $conf_path -name "server_*.info" |wc -l`
+
+export HCCL_IF_IP=$fix_node_ip
+export MASTER_ADDR=$one_node_ip
+
+rank_server=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'`
+export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'`
+
+# Training start time (no need to modify)
+start_time=$(date +%s)
+
+# Enter the directory that holds the training script (review per model)
+cd $cur_path/../
+# Positional training parameters start at $5; $1-$4 are deliberately left
+# unbound here (presumably they carry the --key=value options parsed
+# earlier; confirm against the launcher). init_checkpoint and squad_dir
+# are plain path defaults: nothing below reads them when CMD is assembled.
+init_checkpoint=${5:-"${data_path}/pretrained/bert_base_pretrain.pt"}
+epochs=${6:-"1.0"}
+batch_size=${7:-"80"}
+learning_rate=${8:-"2e-4"}
+precision=${9:-"fp16"}
+num_npu=${10:-"16"}
+seed=${11:-"1"}
+squad_dir=${12:-"${data_path}/squad/v1.1"}
+vocab_file=${13:-"data/uncased_L-24_H-1024_A-16/vocab.txt"}
+OUT_DIR=${14:-"results/SQuAD"}
+mode=${15:-"train eval"}
+CONFIG_FILE=${16:-"bert_base_config.json"}
+max_steps=${17:-"-1"}
+
+echo "out dir is $OUT_DIR"
+mkdir -p $OUT_DIR
+if [ ! -d "$OUT_DIR" ]; then
+  echo "ERROR: non existing $OUT_DIR"
+  exit 1
+fi
+
+use_fp16=""
+if [ "$precision" = "fp16" ] ; then
+  echo "fp16 activated!"
+  use_fp16=" --fp16 "
+fi
+
+CMD="python3.7 run_squad.py "
+CMD+="--init_checkpoint=${data_path}/pretrained/bert_base_pretrain.pt "
+if [ "$mode" = "train" ] ; then
+  CMD+="--do_train "
+  CMD+="--train_file=${data_path}/squad/v1.1/train-v1.1.json "
+  CMD+="--train_batch_size=$batch_size "
+elif [ "$mode" = "eval" ] ; then
+  CMD+="--do_predict "
+  CMD+="--predict_file=${data_path}/squad/v1.1/dev-v1.1.json "
+  CMD+="--predict_batch_size=$batch_size "
+  CMD+="--eval_script=${data_path}/squad/v1.1/evaluate-v1.1.py "
+  CMD+="--do_eval "
+elif [ "$mode" = "prediction" ] ; then
+  CMD+="--do_predict "
+  CMD+="--predict_file=${data_path}/squad/v1.1/dev-v1.1.json "
+  CMD+="--predict_batch_size=$batch_size "
+else
+  CMD+=" --do_train "
+  CMD+=" --train_file=${data_path}/squad/v1.1/train-v1.1.json "
+  CMD+=" --train_batch_size=$batch_size "
+  CMD+="--do_predict "
+  CMD+="--predict_file=${data_path}/squad/v1.1/dev-v1.1.json "
+  CMD+="--predict_batch_size=$batch_size "
+  CMD+="--eval_script=${data_path}/squad/v1.1/evaluate-v1.1.py "
+  CMD+="--do_eval "
+fi
+
+CMD+=" --do_lower_case "
+CMD+=" --bert_model=bert-large-uncased "
+CMD+=" --learning_rate=$learning_rate "
+CMD+=" --seed=$seed "
+CMD+=" --num_train_epochs=$epochs "
+CMD+=" --max_seq_length=384 "
+CMD+=" --doc_stride=128 "
+CMD+=" --output_dir=$OUT_DIR "
+CMD+=" --vocab_file=$vocab_file "
+CMD+=" --config_file=$CONFIG_FILE "
+CMD+=" --max_steps=$max_steps "
+CMD+=" $use_fp16"
+CMD+=" --use_npu"
+CMD+=" --num_npu=$num_npu"
+CMD+=" --loss_scale=4096"
+CMD+=" --addr=$one_node_ip"
+
+# Launch one background training process per local device index 0-7; each
+# process logs to output/<i>/train_<i>.log. RANK is exported in both branches
+# because run_squad.py reads it from the environment.
+if [ $(uname -m) = "aarch64" ]
+then
+  for i in $(seq 0 7)
+  do
+  # Pin process i to CPUs 24*i .. 24*i+23.
+  let p_start=0+24*i
+  let p_end=23+24*i
+  # Global rank = local index + rank_server (8 * server_index).
+  export RANK=$((i + rank_server))
+  # rm -rf is a no-op on a missing path, so no existence test is needed.
+  rm -rf ${cur_path}/output/${i}
+  mkdir -p ${cur_path}/output/$i
+  taskset -c $p_start-$p_end $CMD --local_rank=$i > ${cur_path}/output/${i}/train_${i}.log 2>&1 &
+  done
+else
+  for i in $(seq 0 7)
+  do
+  # Same rank formula as above so non-aarch64 hosts get a RANK too.
+  export RANK=$((i + rank_server))
+  rm -rf ${cur_path}/output/${i}
+  mkdir -p ${cur_path}/output/$i
+  $CMD --local_rank=$i > ${cur_path}/output/${i}/train_${i}.log 2>&1 &
+  done
+fi
+wait
+
+ASCEND_DEVICE_ID=0
+# Training end time (no need to modify)
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results (no need to modify)
+echo "------------------ Final result ------------------"
+# Output performance FPS (review per model): largest iter/s in the device-0 log * 16 * batch_size
+iter=`grep 'Epoch: ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "iter/s :" '{print $NF}'|awk 'NR==1{max=$1;next}{max=max>$1?max:$1}END{print max}'`
+FPS=`awk 'BEGIN{printf "%.2f\n",'${iter}'*16*'${batch_size}'}'`
+# Print (no need to modify)
+echo "Final Performance images/sec : $FPS"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring result summary
+# Training case information (no need to modify)
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+## Collect performance data (no need to modify)
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'`
+
+# Extract the loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt (review per model)
+grep -r "step_loss :" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the last iteration (no need to modify)
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Write the key information to ${CaseName}.log (no need to modify)
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+rm -rf ${data_path}/squad/v1.1/train-v1.1.json_bert-large-uncased_384_128_64
\ No newline at end of file
diff --git a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_8p.sh b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_8p.sh
index 1e7536648a7f2a03bd1877a3c095cb3617ec51db..d38de0ad260014dfb92fe9b1b48de0894f0a5965 100644
--- a/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_8p.sh
+++ b/PyTorch/dev/cv/image_classification/BertBase_ID0490_for_PyTorch/test/train_performance_8p.sh
@@ -130,6 +130,7 @@ then
   do
   let p_start=0+24*i
   let p_end=23+24*i
+  export RANK=${i}
   if [ -d ${cur_path}/output/${i} ];then
         rm -rf ${cur_path}/output/${i}
         mkdir -p ${cur_path}/output/$i
@@ -141,6 +142,7 @@ then
 else
   for i in $(seq 0 7)
   do
+  export RANK=${i}
   if [ -d ${cur_path}/output/${i} ];then
         rm -rf ${cur_path}/output/${i}
         mkdir -p ${cur_path}/output/$i