diff --git a/TensorFlow/built-in/cv/detection/CTPN_ID0054_for_TensorFlow/main/train_npu_rt.py b/TensorFlow/built-in/cv/detection/CTPN_ID0054_for_TensorFlow/main/train_npu_rt.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fa6753260e56a296e3ea4075dd224be875bd3a4
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/CTPN_ID0054_for_TensorFlow/main/train_npu_rt.py
@@ -0,0 +1,249 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import datetime
+import os
+import sys
+import time
+
+import tensorflow as tf
+import numpy as np
+sys.path.append(os.getcwd())
+
+cur_path = os.path.abspath(os.path.dirname(__file__))
+working_dir = os.path.join(cur_path, '../')
+sys.path.append(working_dir)
+
+from tensorflow.contrib import slim
+
+tf.app.flags.DEFINE_float('learning_rate', 1e-5, 'Initial learning rate of the Adam optimizer.')
+tf.app.flags.DEFINE_integer('max_steps', 50000, 'Step index at which the training loop stops.')
+tf.app.flags.DEFINE_integer('decay_steps', 30000, 'Multiply the learning rate by decay_rate every this many steps.')
+tf.app.flags.DEFINE_float('decay_rate', 0.1, 'Factor applied to the learning rate at each decay.')
+tf.app.flags.DEFINE_float('moving_average_decay', 0.997, 'Decay of the exponential moving average over trainable variables.')
+tf.app.flags.DEFINE_integer('num_readers', 4, 'Number of workers passed to the data provider.')
+tf.app.flags.DEFINE_string('gpu', '0', 'Value exported as CUDA_VISIBLE_DEVICES.')
+tf.app.flags.DEFINE_string('checkpoint_path', "checkpoints_mlt/", 'Directory where checkpoints are written and restored from.')
+tf.app.flags.DEFINE_string('logs_path', 'logs_mlt/', 'Path prefix of the summary log directories.')
+tf.app.flags.DEFINE_string('pretrained_model_path', 'data/vgg_16.ckpt', 'Checkpoint used to initialise trainable variables on a fresh run.')
+tf.app.flags.DEFINE_boolean('restore', False, 'Resume from the latest checkpoint in checkpoint_path.')
+tf.app.flags.DEFINE_integer('save_checkpoint_steps', 2000, 'Save a checkpoint every this many steps.')
+tf.app.flags.DEFINE_string('dataset_dir', 'resized/', 'Training dataset directory (not read directly by this script).')
+tf.app.flags.DEFINE_integer('num_bbox', 256, 'Number of rows of the input_bbox placeholder; boxes are padded or truncated to it.')
+tf.app.flags.DEFINE_integer('loss_scale', 4096, 'Fixed loss scale used unless precision_mode is allow_mix_precision.')
+tf.app.flags.DEFINE_integer('inputs_height', 600, 'Height of the input_image placeholder.')
+tf.app.flags.DEFINE_integer('inputs_width', 900, 'Width of the input_image placeholder.')
+tf.app.flags.DEFINE_integer('device_id', 1, 'Device index passed by the launch script (not read by this script).')
+tf.app.flags.DEFINE_integer('npu_nums', 1, 'Number of NPUs; 8 enables the distributed optimizer and the variable broadcast.')
+tf.app.flags.DEFINE_string('DEVICE_ID', '0', 'Suffix appended to logs_path to name the per-device log directory.')
+# modify for NPU start
+tf.app.flags.DEFINE_string('precision_mode', 'allow_fp32_to_fp16', 'NPU precision mode; allow_mix_precision also enables dynamic loss scaling.')
+# modify for NPU end
+
+FLAGS = tf.app.flags.FLAGS
+
+
+from nets import model_train as model
+from utils.dataset import data_provider as data_provider
+from hccl.split.api import set_split_strategy_by_size
+# npu libs
+from npu_bridge.estimator import npu_ops
+from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
+from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
+from npu_bridge.estimator.npu.npu_optimizer import allreduce
+from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
+from npu_bridge.hccl import hccl_ops
+from npu_bridge.estimator.npu.npu_loss_scale_optimizer import NPULossScaleOptimizer
+from npu_bridge.estimator.npu.npu_loss_scale_manager import FixedLossScaleManager
+
+from tensorflow.python.client import timeline
+
+# modify for NPU start
+from npu_bridge.npu_init import *
+# modify for NPU end
+
+def pad_input(inputs, target_shape=(1216, 1216, 3)):
+    """Return a uint8 array of ``target_shape`` holding ``inputs`` in its top-left corner, zeros elsewhere."""
+    h, w = inputs.shape[:2]
+    # Allocate as uint8 directly; zeros().astype() built a float64 temporary first.
+    out = np.zeros(target_shape, dtype=np.uint8)
+    out[0:h, 0:w, :] = inputs
+    return out
+
+
+def pad_bbox(inputs, count=256):
+    """Truncate or pad a list of boxes to exactly ``count`` rows.
+
+    Returns a shallow copy of ``inputs``: cut to the first ``count`` entries
+    when it is longer, otherwise extended with ``[0, 0, 0, 0, 1]`` filler rows.
+    """
+    if len(inputs) > count:
+        return inputs[:count].copy()
+    padded = inputs.copy()
+    missing = count - len(padded)
+    padded.extend([0, 0, 0, 0, 1] for _ in range(missing))
+    return padded
+
+
+def broadcast_global_variables(root_rank, index):
+    """Group ops that broadcast each float global variable from ``root_rank`` and assign the result back (``index`` is unused)."""
+    sync_ops = []
+    float_vars = [v for v in tf.global_variables() if "float" in v.dtype.name]
+    for variable in float_vars:
+        received = hccl_ops.broadcast(tensor=[variable], root_rank=root_rank)
+        if received is None:
+            continue
+        sync_ops.extend([received[0].op, tf.assign(variable, received[0])])
+    return tf.group(sync_ops)
+
+def main(argv=None):
+    """Build the CTPN training graph and run the training loop on NPU.
+
+    All settings are read from FLAGS. Summaries are written under
+    FLAGS.logs_path and checkpoints under FLAGS.checkpoint_path.
+    """
+    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu
+    StyleTime = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    # exist_ok=True: re-running with the same directories must not raise.
+    os.makedirs(FLAGS.logs_path + FLAGS.DEVICE_ID, exist_ok=True)
+    os.makedirs(FLAGS.checkpoint_path, exist_ok=True)
+
+    input_image = tf.placeholder(tf.float32,
+            shape=[1,FLAGS.inputs_height, FLAGS.inputs_width, 3],
+            name='input_image')
+    input_bbox = tf.placeholder(tf.float32,
+            shape=[FLAGS.num_bbox, 5], name='input_bbox')
+
+    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
+    learning_rate = tf.Variable(FLAGS.learning_rate, trainable=False)
+    tf.summary.scalar('learning_rate', learning_rate)
+    opt = tf.train.AdamOptimizer(learning_rate)
+    if FLAGS.npu_nums == 8:
+        opt = NPUDistributedOptimizer(opt)
+
+    # modify for NPU start
+    if FLAGS.precision_mode == "allow_mix_precision":
+        loss_scale_manager = ExponentialUpdateLossScaleManager(
+            init_loss_scale=2**32,
+            incr_every_n_steps=1000,
+            decr_every_n_nan_or_inf=2,
+            decr_ratio=0.5)
+    else:
+        loss_scale_manager = FixedLossScaleManager(loss_scale=FLAGS.loss_scale)
+    # modify for NPU end
+    opt = NPULossScaleOptimizer(opt, loss_scale_manager)
+
+    with tf.name_scope('model') as scope:
+        bbox_pred, cls_pred, cls_prob = model.model(input_image)
+        total_loss, model_loss, rpn_cross_entropy, rpn_loss_box = model.loss_v2(bbox_pred, cls_pred, input_bbox)
+        batch_norm_updates_op = tf.group(*tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope))
+        grads = opt.compute_gradients(total_loss)
+
+    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+    summary_op = tf.summary.merge_all()
+    variable_averages = tf.train.ExponentialMovingAverage(
+        FLAGS.moving_average_decay, global_step)
+    variables_averages_op = variable_averages.apply(tf.trainable_variables())
+    # train_op does nothing itself; it only forces the three update ops to run.
+    with tf.control_dependencies([variables_averages_op, apply_gradient_op, batch_norm_updates_op]):
+        train_op = tf.no_op(name='train_op')
+
+    saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
+    summary_writer = tf.summary.FileWriter(FLAGS.logs_path + StyleTime, tf.get_default_graph())
+    init = tf.global_variables_initializer()
+
+    if FLAGS.pretrained_model_path is not None:
+        variable_restore_op = slim.assign_from_checkpoint_fn(FLAGS.pretrained_model_path,
+                                                             slim.get_trainable_variables(),
+                                                             ignore_missing_vars=True)
+
+    # NPU session configuration
+    config = tf.ConfigProto(allow_soft_placement=True)
+    custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    custom_op.parameter_map["use_off_line"].b = True
+    custom_op.parameter_map["hcom_parallel"].b = True
+    custom_op.parameter_map["jit_compile"].b = False
+    config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
+    # modify for NPU start
+    if FLAGS.precision_mode == "allow_mix_precision":
+        custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
+    # modify for NPU end
+    if FLAGS.npu_nums == 8:
+        bcast_op = broadcast_global_variables(0, 1)
+    with tf.Session(config=config) as sess:
+        # NOTE(review): the broadcast runs before variables are initialised or
+        # restored below, so it cannot synchronise those values -- confirm ordering.
+        if FLAGS.npu_nums == 8:
+            sess.run(bcast_op)
+        if FLAGS.restore:
+            ckpt = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
+            if ckpt is None:
+                raise ValueError(
+                    'no checkpoint found in {}'.format(FLAGS.checkpoint_path))
+            # Parse the step from the file name only: directory names may contain '.' or '_'.
+            ckpt_name = os.path.basename(ckpt)
+            restore_step = int(ckpt_name.split('.')[0].split('_')[-1])
+            print("continue training from previous checkpoint {}".format(restore_step))
+            saver.restore(sess, ckpt)
+        else:
+            sess.run(init)
+            if FLAGS.npu_nums == 8:
+                set_split_strategy_by_size([80, 20])
+            restore_step = 0
+            if FLAGS.pretrained_model_path is not None:
+                variable_restore_op(sess)
+        data_generator = data_provider.get_batch(num_workers=FLAGS.num_readers)
+        start = time.time()
+
+        for step in range(restore_step, FLAGS.max_steps):
+            data = next(data_generator)
+            ml, tl, ce_loss, bbox_loss, _, summary_str = sess.run(
+                [model_loss, total_loss, rpn_cross_entropy, rpn_loss_box,
+                 train_op, summary_op],
+                feed_dict={input_image: data[0],
+                           input_bbox: pad_bbox(data[1], FLAGS.num_bbox)})
+            summary_writer.add_summary(summary_str, global_step=step)
+            print('model loss :', ml, 'ce_loss: ', ce_loss, 'box_loss:',bbox_loss)
+            if step != 0 and step % FLAGS.decay_steps == 0:
+                sess.run(tf.assign(learning_rate, learning_rate.eval() * FLAGS.decay_rate))
+
+            if step % 10 == 0:
+                avg_time_per_step = (time.time() - start) / 10
+                start = time.time()
+                print('Step {:06d}, ce_loss {:.6f}, bbox_loss {:.6f}  model loss {:.4f}, total loss {:.4f}, {:.2f} seconds/step, LR: {:.6f}'.format(
+                    step, ce_loss, bbox_loss, ml, tl, avg_time_per_step, learning_rate.eval()))
+
+            if (step + 1) % FLAGS.save_checkpoint_steps == 0:
+                filename = os.path.join(FLAGS.checkpoint_path, 'ctpn_{:d}.ckpt'.format(step + 1))
+                saver.save(sess, filename)
+                print('Write model to: {:s}'.format(filename))
+        summary_writer.close()  # flush buffered summaries to disk before exiting
+
+if __name__ == '__main__':
+    tf.app.run()  # tf.app.run() parses the flags and then calls main()
diff --git a/TensorFlow/built-in/cv/detection/CTPN_ID0054_for_TensorFlow/test/train_RT2_performance_1p.sh b/TensorFlow/built-in/cv/detection/CTPN_ID0054_for_TensorFlow/test/train_RT2_performance_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..101e3c01e02b60ba26bb0decf01c475a57e9eb11
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/CTPN_ID0054_for_TensorFlow/test/train_RT2_performance_1p.sh
@@ -0,0 +1,171 @@
+#!/bin/bash
+
+# Current directory; no need to modify
+cur_path=$(pwd)
+
+# Collective-communication parameters; no need to modify
+
+export RANK_SIZE=1
+export JOB_ID=10087
+RANK_ID_START=0
+export PATH="${PATH}:/autotest/anaconda3/bin"
+source activate python3.7.5
+
+# Enable RT2.0
+export ENABLE_RUNTIME_V2=1
+
+# Dataset path; keep empty here, no need to modify
+data_path=''
+
+
+# Basic parameters; review and adjust per model
+# Network name, same as the directory name
+Network='CTPN_ID0054_for_TensorFlow'
+# Training epochs
+#train_epochs=1
+# Training batch size
+batch_size=1
+# Training steps
+train_steps=100
+save_checkpoint_steps=20
+# Learning rate
+learning_rate=1e-5
+
+# TF2.x only; no need to modify
+#export NPU_LOOP_SIZE=${train_steps}
+
+# Debug parameters; precision_mode should be reviewed per model
+precision_mode='allow_mix_precision'
+# Maintenance parameters; nothing below needs to be modified
+over_dump=False
+data_dump_flag=False
+data_dump_step=10
+profiling=False
+
+# Help message; no need to modify (echo must be followed by a space to be run as a command)
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage: ./train_RT2_performance_1p.sh <args>"
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+# Parse --key=value arguments; "$@" and direct expansion keep each value intact (no re-splitting or globbing)
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    fi
+done
+
+# Check that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+# CTPN only: run utils/bbox/make.sh.
+# Abort if the directory is missing instead of running chmod/make.sh in the wrong place.
+cd "${cur_path}/../utils/bbox/" || exit 1
+chmod +x make.sh
+./make.sh
+
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Enter the directory that contains the training script; review per model
+cd ${cur_path}/..
+for ((RANK_ID = RANK_ID_START; RANK_ID < RANK_SIZE + RANK_ID_START; RANK_ID++))
+do
+    # Set environment variables; no need to modify
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+
+    # Recreate the per-device output directory from scratch; no need to modify.
+    # rm -rf succeeds silently when the directory is absent, so no -d test is needed.
+    device_out=${cur_path}/output/${ASCEND_DEVICE_ID}
+    rm -rf ${device_out}
+    mkdir -p ${device_out}
+
+    # Launch training in the background, logging to train_<device>.log.
+    # The arguments listed next need no change; the others should be reviewed per model:
+    # --data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path, --data_dump_flag, --data_dump_step, --data_dump_path, --profiling, --profiling_dump_path, --autotune
+    nohup python3 main/train_npu_rt.py \
+        --precision_mode=$precision_mode \
+        --pretrained_model_path=$data_path/vgg_16.ckpt \
+        --dataset_dir=$data_path \
+        --max_steps=$train_steps \
+        --save_checkpoint_steps=$save_checkpoint_steps \
+        --checkpoint_path=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( end_time - start_time ))
+
+# CTPN only: run eval.sh on the final checkpoint, appending its output to the training log
+out_dir=${cur_path}/output/${ASCEND_DEVICE_ID}
+train_log=${out_dir}/train_${ASCEND_DEVICE_ID}.log
+bash eval.sh ${out_dir}/ckpt/ctpn_$train_steps.ckpt >> ${train_log} 2>&1
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Performance: field 13 of the last "Step" line is the average seconds per step
+train_time=$(grep Step ${train_log} | awk 'END {print $13}')
+# images/sec = batch_size / seconds-per-step (the former 1000x factor assumed milliseconds)
+FPS=$(awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${train_time}'}')
+echo "Final Performance images/sec : $FPS"
+# Training accuracy; review per model
+train_accuracy=$(grep Calculated ${train_log} | awk 'END {print $6}' | awk -F '}' '{print $1}')
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+# Summary for stability and accuracy monitoring
+# Test-case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'perf'
+# Throughput and per-iteration training time; no need to modify
+ActualFPS=${FPS}
+TrainingTime=$train_time
+# Extract the total loss (field 12 of each "total loss" line) into train_${CaseName}_loss.txt; review per model
+grep "total loss" ${train_log} | awk '{print $12}' | awk -F ',' '{print $1}' >> ${out_dir}/train_${CaseName}_loss.txt
+# Loss of the last iteration; no need to modify
+ActualLoss=$(awk 'END {print}' ${out_dir}/train_${CaseName}_loss.txt)
+# Write the key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > ${out_dir}/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${out_dir}/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${out_dir}/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${out_dir}/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${out_dir}/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> ${out_dir}/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> ${out_dir}/${CaseName}.log
+echo "train_accuracy = ${train_accuracy}" >> ${out_dir}/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> ${out_dir}/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${out_dir}/${CaseName}.log
+# Blank out the impl.unsorted_segment_sum ModuleNotFoundError text; skip sed when no log contains it
+err_logs=$(grep -l ModuleNotFoundError ${out_dir}/train_*.log)
+if [ -n "${err_logs}" ]; then sed -i "s/ModuleNotFoundError: No module named 'impl.unsorted_segment_sum'/ /g" ${err_logs}; fi
diff --git a/TensorFlow/built-in/cv/detection/CTPN_ID0054_for_TensorFlow/test/train_RT2_performance_8p.sh b/TensorFlow/built-in/cv/detection/CTPN_ID0054_for_TensorFlow/test/train_RT2_performance_8p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cfd98d12c0b12172db0faaf581552bffb3e7a340
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/CTPN_ID0054_for_TensorFlow/test/train_RT2_performance_8p.sh
@@ -0,0 +1,205 @@
+#!/bin/bash
+
+# Current directory; no need to modify
+cur_path=$(pwd)
+
+# Collective-communication parameters; no need to modify
+export RANK_SIZE=8
+export RANK_TABLE_FILE="${cur_path}/${RANK_SIZE}p.json"
+export JOB_ID=10087
+export DEVICE_INDEX=0
+RANK_ID_START=0
+
+# Enable RT2.0
+export ENABLE_RUNTIME_V2=1
+
+# Dataset path; keep empty here, no need to modify
+data_path=''
+# Default log level; no need to modify
+export ASCEND_GLOBAL_LOG_LEVEL_ETP=3
+
+
+# Network name, same as the directory name
+Network='CTPN_ID0054_for_TensorFlow'
+# Training epochs
+#train_epochs=1
+# Training batch size
+batch_size=1
+# Training steps
+train_steps=100
+save_checkpoint_steps=20
+# Learning rate
+learning_rate=8e-5
+
+
+# TF2.x only; no need to modify
+#export NPU_LOOP_SIZE=${train_steps}
+
+# Debug parameters; precision_mode should be reviewed per model
+precision_mode='allow_mix_precision'
+# Maintenance parameters; nothing below needs to be modified
+over_dump=False
+data_dump_flag=False
+data_dump_step=10
+profiling=False
+
+
+# Help message. Lists the options that the argument loop in this script parses;
+# the previous text described options this script never reads.
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage: ./train_RT2_performance_8p.sh <args>"
+    echo ""
+    echo "parameter explain:
+    --precision_mode      precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump           if or not over detection, default is False
+    --data_dump_flag      data dump flag, default is False
+    --data_dump_step      data dump step, default is 10
+    --profiling           if or not profiling for performance debug, default is False
+    --data_path           source data of training
+    -h/--help             Show help message
+    "
+    exit 1
+fi
+
+# Empty an existing output directory, then create this device's subdirectory
+if [ -d $cur_path/output ];then
+   rm -rf $cur_path/output/*
+fi
+mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+
+# Parse --key=value arguments; "$@" and direct expansion keep each value intact (no re-splitting or globbing)
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    fi
+done
+
+
+# Check that data_path was passed in
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+
+# CTPN only: run utils/bbox/make.sh.
+# Abort if the directory is missing instead of running chmod/make.sh in the wrong place.
+cd "${cur_path}/../utils/bbox/" || exit 1
+chmod +x make.sh
+./make.sh
+
+
+
+############# Run training #########################
+start=$(date +%s)
+
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    # Set environment variables; no need to modify.
+    # Export the device ID before echoing it; echoing first printed the previous rank's value.
+    export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    echo "Device ID: $ASCEND_DEVICE_ID"
+
+    # Recreate the per-device output directory with its ckpt subdirectory; no need to modify.
+    # rm -rf succeeds silently when the directory is absent, so no -d test is needed.
+    rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+
+    # Launch training in the background; the arguments below need no change, others should be reviewed per model
+    cd $cur_path/../
+    nohup python3 main/train_npu_rt.py \
+        --precision_mode=$precision_mode \
+        --pretrained_model_path=$data_path/vgg_16.ckpt \
+        --dataset_dir=$data_path \
+        --max_steps=$train_steps \
+        --device_id=$ASCEND_DEVICE_ID \
+        --npu_nums=$RANK_SIZE \
+        --DEVICE_ID=$ASCEND_DEVICE_ID \
+        --save_checkpoint_steps=$save_checkpoint_steps \
+        --checkpoint_path=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+# Block until every background training process has exited
+wait
+
+# End-to-end wall-clock duration in seconds
+end=$(date +%s)
+e2etime=$(( $end - $start ))
+
+
+
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+out_dir=$cur_path/output/$ASCEND_DEVICE_ID
+train_log=$out_dir/train_$ASCEND_DEVICE_ID.log
+# Performance: field 13 of the last "Step" line is the average seconds per step
+train_time=$(grep Step $train_log | awk 'END {print $13}')
+# images/sec = ranks * batch_size / seconds-per-step (the former 1000x factor assumed milliseconds)
+FPS=$(awk 'BEGIN{printf "%.2f\n",'${RANK_SIZE}'*'${batch_size}'/'${train_time}'}')
+echo "Final Performance images/sec : $FPS"
+# Training accuracy; review per model
+train_accuracy=$(grep Calculated $train_log | awk 'END {print $6}' | awk -F '}' '{print $1}')
+echo "Final Train Accuracy : ${train_accuracy}"
+# The elapsed time is stored in e2etime; the unset name e2e_time printed an empty value
+echo "E2E Training Duration sec : $e2etime"
+# Test-case information for the stability/accuracy summary; no need to modify
+BatchSize=${batch_size}
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'perf'
+# Throughput and per-iteration training time; no need to modify
+ActualFPS=${FPS}
+TrainingTime=$train_time
+# Extract the total loss (field 12 of each "total loss" line); the last value is the final-iteration loss
+grep "total loss" $train_log | awk '{print $12}' | awk -F ',' '{print $1}' >> $out_dir/train_${CaseName}_loss.txt
+ActualLoss=$(awk 'END {print}' $out_dir/train_${CaseName}_loss.txt)
+# Write the key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > $out_dir/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $out_dir/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $out_dir/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $out_dir/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $out_dir/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $out_dir/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $out_dir/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $out_dir/${CaseName}.log
+echo "E2ETrainingTime = ${e2etime}" >> $out_dir/${CaseName}.log
+# Blank out the impl.unsorted_segment_sum ModuleNotFoundError text; skip sed when no log contains it
+err_logs=$(grep -l ModuleNotFoundError $out_dir/train_*.log)
+if [ -n "${err_logs}" ]; then sed -i "s/ModuleNotFoundError: No module named 'impl.unsorted_segment_sum'/ /g" ${err_logs}; fi
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/models/research/object_detection/model_lib_rt.py b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/models/research/object_detection/model_lib_rt.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf6d3834cbc6e3e1d7b52dee35bde797eb25d04b
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/models/research/object_detection/model_lib_rt.py
@@ -0,0 +1,393 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+'Constructs model, inputs, and training environment.'
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from npu_bridge.npu_init import *
+from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
+from tensorflow.core.protobuf import config_pb2
+import copy
+import functools
+import os
+import tensorflow as tf
+#import horovod.tensorflow as hvd
+from object_detection import eval_util
+from object_detection import exporter as exporter_lib
+from object_detection import inputs
+from object_detection.builders import graph_rewriter_builder
+from object_detection.builders import model_builder
+from object_detection.builders import optimizer_builder
+from object_detection.core import standard_fields as fields
+from object_detection.utils import config_util
+from object_detection.utils import label_map_util
+from object_detection.utils import shape_utils
+from object_detection.utils import variables_helper
+from object_detection.utils import visualization_utils as vis_utils
+
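+# Since the 2021.5 release upgrade the helpers quoted below are archived in the version package (presumably npu_bridge, star-imported above -- confirm); the text is kept only for reference.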
+'''class NpuEmptyHook(tf.train.SessionRunHook):
+    pass
+
+def npu_tf_optimizer(opt):
+    npu_opt = NPUDistributedOptimizer(opt)
+    return npu_opt
+
+def npu_session_config_init(session_config=None):
+    if ((not isinstance(session_config, config_pb2.ConfigProto)) and (not issubclass(type(session_config), config_pb2.ConfigProto))):
+        session_config = config_pb2.ConfigProto()
+    if (isinstance(session_config, config_pb2.ConfigProto) or issubclass(type(session_config), config_pb2.ConfigProto)):
+        custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add()
+        custom_op.name = 'NpuOptimizer'
+        session_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
+    return session_config
+
+def npu_run_config_init(run_config=None):
+    if ((not isinstance(run_config, tf.estimator.RunConfig)) and (not issubclass(type(run_config), tf.estimator.RunConfig))):
+        run_config = tf.estimator.RunConfig()
+    if (isinstance(run_config, tf.estimator.RunConfig) or issubclass(type(run_config), tf.estimator.RunConfig)):
+        run_config.__dict__['_session_config'] = npu_session_config_init(run_config.session_config)
+    return run_config'''
+MODEL_BUILD_UTIL_MAP = dict(get_configs_from_pipeline_file=config_util.get_configs_from_pipeline_file, create_pipeline_proto_from_configs=config_util.create_pipeline_proto_from_configs, merge_external_params_with_configs=config_util.merge_external_params_with_configs, create_train_input_fn=inputs.create_train_input_fn, create_eval_input_fn=inputs.create_eval_input_fn, create_predict_input_fn=inputs.create_predict_input_fn)  # Lookup table through which create_estimator_and_inputs resolves its config and input-fn builders.
+
+def _prepare_groundtruth_for_eval(detection_model, class_agnostic, max_number_of_boxes):
+    "Collects the groundtruth held by `detection_model` into one batched dictionary for eval.\n\n  Args:\n    detection_model: A `DetectionModel` object whose groundtruth lists are read.\n    class_agnostic: Whether the detections are class agnostic. If so, an all-ones\n      class tensor is used instead of the classes stored in the model.\n    max_number_of_boxes: Max number of groundtruth boxes; reported for every\n      image of the batch.\n\n  Returns:\n    groundtruth: Dictionary keyed by `fields.InputDataFields` attributes:\n      groundtruth_boxes: the per-image box tensors stacked along a new axis 0.\n      groundtruth_classes: argmax over axis 2 of the one-hot classes, plus 1\n        (1-indexed classes).\n      groundtruth_instance_masks: stacked instance masks (only if the model\n        holds masks).\n      groundtruth_is_crowd: stacked is_crowd annotations (only if the model\n        holds them).\n      num_groundtruth_boxes: `max_number_of_boxes` tiled once per batch entry.\n  "
+    data_keys = fields.InputDataFields()
+    box_keys = fields.BoxListFields
+    boxes = tf.stack(detection_model.groundtruth_lists(box_keys.boxes))
+    boxes_shape = tf.shape(boxes)
+    if (not class_agnostic):
+        one_hot_classes = tf.stack(detection_model.groundtruth_lists(box_keys.classes))
+    else:
+        one_hot_classes = tf.ones([boxes_shape[0], boxes_shape[1], 1])
+    classes = (tf.argmax(one_hot_classes, axis=2) + 1)
+    groundtruth = {data_keys.groundtruth_boxes: boxes, data_keys.groundtruth_classes: classes}
+    optional_fields = ((box_keys.masks, data_keys.groundtruth_instance_masks), (box_keys.is_crowd, data_keys.groundtruth_is_crowd))
+    for (box_key, data_key) in optional_fields:
+        if detection_model.groundtruth_has_field(box_key):
+            groundtruth[data_key] = tf.stack(detection_model.groundtruth_lists(box_key))
+    groundtruth[data_keys.num_groundtruth_boxes] = tf.tile([max_number_of_boxes], multiples=[boxes_shape[0]])
+    return groundtruth
+
+def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True):
+    'Splits every tensor in `tensor_dict` along its 0th dimension.\n\n  Each value is replaced by the list returned from `tf.unstack`. Tensors in\n  `tensor_dict` are expected to have one of these shapes:\n  1. [batch_size]\n  2. [batch_size, height, width, channels]\n  3. [batch_size, num_boxes, d1, d2, ... dn]\n\n  When `unpad_groundtruth_tensors` is true, the unstacked groundtruth tensors of\n  form 3 are additionally sliced along `num_boxes`, using the matching entry of\n  fields.InputDataFields.num_groundtruth_boxes as the slice size.\n\n  The set of groundtruth fields that get sliced is a static list and has to be\n  kept in sync with the InputDataFields defined in core/standard_fields.py\n\n  Args:\n    tensor_dict: A dictionary of batched groundtruth tensors.\n    unpad_groundtruth_tensors: Whether to remove padding along the `num_boxes`\n      dimension of the groundtruth tensors.\n\n  Returns:\n    A dictionary with the same keys as `tensor_dict` whose values are lists of\n    unstacked (optionally unpadded) tensors.\n\n  Raises:\n    ValueError: If `unpad_groundtruth_tensors` is True and `tensor_dict` does\n      not contain the `num_groundtruth_boxes` tensor.\n  '
+    unstacked = {name: tf.unstack(batched) for (name, batched) in tensor_dict.items()}
+    if (not unpad_groundtruth_tensors):
+        return unstacked
+    data_keys = fields.InputDataFields
+    if (data_keys.num_groundtruth_boxes not in unstacked):
+        raise ValueError('`num_groundtruth_boxes` not found in tensor_dict. Keys available: {}'.format(unstacked.keys()))
+    def _strip_padding(num_boxes, padded):
+        # Slice from the origin: `num_boxes` entries on axis 0; the other axes use their known size, or -1 where the size is None.
+        dims = shape_utils.combined_static_and_dynamic_shape(padded)
+        begin = tf.zeros([len(dims)], dtype=tf.int32)
+        size = tf.stack(([num_boxes] + [(dim if (dim is not None) else (- 1)) for dim in dims[1:]]))
+        return tf.slice(padded, begin, size)
+    padded_keys = ({data_keys.groundtruth_instance_masks, data_keys.groundtruth_classes, data_keys.groundtruth_boxes, data_keys.groundtruth_keypoints, data_keys.groundtruth_group_of, data_keys.groundtruth_difficult, data_keys.groundtruth_is_crowd, data_keys.groundtruth_area, data_keys.groundtruth_weights} & set(unstacked.keys()))
+    box_counts = unstacked[data_keys.num_groundtruth_boxes]
+    unpadded = {name: [_strip_padding(count, padded) for (count, padded) in zip(box_counts, unstacked[name])] for name in padded_keys}
+    unstacked.update(unpadded)
+    return unstacked
+
+def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
+    'Creates a model function for `Estimator`.\n\n  Args:\n    detection_model_fn: Function that returns a `DetectionModel` instance. It is\n      called with `is_training` and `add_summaries` keyword arguments.\n    configs: Dictionary of pipeline config objects. The keys `train_config`,\n      `eval_input_config` and `eval_config` are read here; an optional\n      `graph_rewriter_config` entry is applied when present.\n    hparams: `HParams` object. `hparams.load_pretrained` is read to decide\n      whether variables are initialized from `train_config.fine_tune_checkpoint`.\n    use_tpu: Boolean indicating whether model should be constructed for\n        use on TPU.\n\n  Returns:\n    `model_fn` for `Estimator`.\n  '
+    train_config = configs['train_config']
+    eval_input_config = configs['eval_input_config']
+    eval_config = configs['eval_config']
+
+    def model_fn(features, labels, mode, params=None):
+        'Constructs the object detection model.\n\n    Args:\n      features: Dictionary of feature tensors, returned from `input_fn`.\n      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,\n        otherwise None.\n      mode: Mode key from tf.estimator.ModeKeys.\n      params: Parameter dictionary passed from the estimator.\n\n    Returns:\n      An `EstimatorSpec` that encapsulates the model and its serving\n        configurations. When `use_tpu` is set and the mode is not EVAL, a\n        `TPUEstimatorSpec` is returned instead.\n    '
+        params = (params or {})
+        (total_loss, train_op, detections, export_outputs) = (None, None, None, None)
+        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
+        tf.keras.backend.set_learning_phase(is_training)
+        detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu))
+        scaffold_fn = None
+        
+        # Data preprocessing: unstack the batched labels and hand the groundtruth to the model.
+        if (mode == tf.estimator.ModeKeys.TRAIN):
+            labels = unstack_batch(labels, unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors)
+        elif (mode == tf.estimator.ModeKeys.EVAL):
+            boxes_shape = labels[fields.InputDataFields.groundtruth_boxes].get_shape().as_list()
+            unpad_groundtruth_tensors = ((boxes_shape[1] is not None) and (not use_tpu))
+            labels = unstack_batch(labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)
+        if (mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL)):
+            gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
+            gt_classes_list = labels[fields.InputDataFields.groundtruth_classes]
+            gt_masks_list = None
+            if (fields.InputDataFields.groundtruth_instance_masks in labels):
+                gt_masks_list = labels[fields.InputDataFields.groundtruth_instance_masks]
+            gt_keypoints_list = None
+            if (fields.InputDataFields.groundtruth_keypoints in labels):
+                gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints]
+            gt_weights_list = None
+            if (fields.InputDataFields.groundtruth_weights in labels):
+                gt_weights_list = labels[fields.InputDataFields.groundtruth_weights]
+            gt_confidences_list = None
+            if (fields.InputDataFields.groundtruth_confidences in labels):
+                gt_confidences_list = labels[fields.InputDataFields.groundtruth_confidences]
+            gt_is_crowd_list = None
+            if (fields.InputDataFields.groundtruth_is_crowd in labels):
+                gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd]
+            detection_model.provide_groundtruth(groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_confidences_list=gt_confidences_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list, groundtruth_weights_list=gt_weights_list, groundtruth_is_crowd_list=gt_is_crowd_list)
+        preprocessed_images = features[fields.InputDataFields.image]
+        
+        # Forward pass: compute the raw predictions (bfloat16 outputs are cast back to float32 on the TPU path).
+        if (use_tpu and train_config.use_bfloat16):
+            with tf.contrib.tpu.bfloat16_scope():
+                prediction_dict = detection_model.predict(preprocessed_images, features[fields.InputDataFields.true_image_shape])
+                for (k, v) in prediction_dict.items():
+                    if (v.dtype == tf.bfloat16):
+                        prediction_dict[k] = tf.cast(v, tf.float32)
+        else:
+            prediction_dict = detection_model.predict(preprocessed_images, features[fields.InputDataFields.true_image_shape])
+        
+        # Post-processing (EVAL/PREDICT) and optional initialization from the fine-tune checkpoint (TRAIN).
+        if (mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT)):
+            detections = detection_model.postprocess(prediction_dict, features[fields.InputDataFields.true_image_shape])
+        if (mode == tf.estimator.ModeKeys.TRAIN):
+            if (train_config.fine_tune_checkpoint and hparams.load_pretrained):
+                if (not train_config.fine_tune_checkpoint_type):
+                    if train_config.from_detection_checkpoint:
+                        train_config.fine_tune_checkpoint_type = 'detection'
+                    else:
+                        train_config.fine_tune_checkpoint_type = 'classification'
+                asg_map = detection_model.restore_map(fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=train_config.load_all_detection_checkpoint_vars)
+                available_var_map = variables_helper.get_variables_available_in_checkpoint(asg_map, train_config.fine_tune_checkpoint, include_global_step=False)
+                if use_tpu:
+
+                    def tpu_scaffold():
+                        tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map)
+                        return tf.train.Scaffold()
+                    scaffold_fn = tpu_scaffold
+                else:
+                    tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map)
+        # Loss computation. The optimizer is built here as well because EVAL reads optimizer_summary_vars below.
+        if (mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL)):
+            losses_dict = detection_model.loss(prediction_dict, features[fields.InputDataFields.true_image_shape])
+            losses = [loss_tensor for loss_tensor in losses_dict.values()]
+            if train_config.add_regularization_loss:
+                regularization_losses = detection_model.regularization_losses()
+                if regularization_losses:
+                    regularization_loss = tf.add_n(regularization_losses, name='regularization_loss')
+                    losses.append(regularization_loss)
+                    losses_dict['Loss/regularization_loss'] = regularization_loss
+            total_loss = tf.add_n(losses, name='total_loss')
+            losses_dict['Loss/total_loss'] = total_loss
+            if ('graph_rewriter_config' in configs):
+                graph_rewriter_fn = graph_rewriter_builder.build(configs['graph_rewriter_config'], is_training=is_training)
+                graph_rewriter_fn()
+            global_step = tf.train.get_or_create_global_step()
+            (training_optimizer, optimizer_summary_vars) = optimizer_builder.build(train_config.optimizer)
+            # Loss scaling (currently disabled):
+            #training_optimizer = NPULossScaleOptimizer(training_optimizer, loss_scale_manager, is_distributed=True)
+
+
+        # Training scenario: filter the trainable variables and build the train op.
+        if (mode == tf.estimator.ModeKeys.TRAIN):
+            if use_tpu:
+                training_optimizer = npu_tf_optimizer(tf.contrib.tpu.CrossShardOptimizer(training_optimizer))  # npu_tf_optimizer is not defined in this module; presumably supplied by the npu_bridge star import -- confirm
+            trainable_variables = None
+            include_variables = (train_config.update_trainable_variables if train_config.update_trainable_variables else None)
+            exclude_variables = (train_config.freeze_variables if train_config.freeze_variables else None)
+            trainable_variables = tf.contrib.framework.filter_variables(tf.trainable_variables(), include_patterns=include_variables, exclude_patterns=exclude_variables)
+            clip_gradients_value = None
+            if (train_config.gradient_clipping_by_norm > 0):
+                clip_gradients_value = train_config.gradient_clipping_by_norm
+            if (not use_tpu):
+                for var in optimizer_summary_vars:
+                    tf.summary.scalar(var.op.name, var)
+            summaries = ([] if use_tpu else None)
+            if train_config.summarize_gradients:
+                summaries = ['gradients', 'gradient_norm', 'global_gradient_norm']
+            # Added for debugging: print the total loss tensor before the train op is built.
+            print("[DEBUG in model_lib] enter optimize_loss,total_loss:",total_loss)
+
+            train_op = tf.contrib.layers.optimize_loss(loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, update_ops=detection_model.updates(), variables=trainable_variables, summaries=summaries, name='')
+
+
+        # Prediction scenario: expose the detections as export outputs.
+        if (mode == tf.estimator.ModeKeys.PREDICT):
+            exported_output = exporter_lib.add_output_tensor_nodes(detections)
+            export_outputs = {tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(exported_output)}
+        eval_metric_ops = None
+        scaffold = None
+        
+        # Evaluation scenario: build the eval dictionary and the metric ops.
+        if (mode == tf.estimator.ModeKeys.EVAL):
+            class_agnostic = (fields.DetectionResultFields.detection_classes not in detections)
+            groundtruth = _prepare_groundtruth_for_eval(detection_model, class_agnostic, eval_input_config.max_number_of_boxes)
+            use_original_images = (fields.InputDataFields.original_image in features)
+            if use_original_images:
+                eval_images = features[fields.InputDataFields.original_image]
+                true_image_shapes = tf.slice(features[fields.InputDataFields.true_image_shape], [0, 0], [(- 1), 3])
+                original_image_spatial_shapes = features[fields.InputDataFields.original_image_spatial_shape]
+            else:
+                eval_images = features[fields.InputDataFields.image]
+                true_image_shapes = None
+                original_image_spatial_shapes = None
+            eval_dict = eval_util.result_dict_for_batched_example(eval_images, features[inputs.HASH_KEY], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=True, original_image_spatial_shapes=original_image_spatial_shapes, true_image_shapes=true_image_shapes)
+            if class_agnostic:
+                category_index = label_map_util.create_class_agnostic_category_index()
+            else:
+                category_index = label_map_util.create_category_index_from_labelmap(eval_input_config.label_map_path)
+            vis_metric_ops = None
+            if ((not use_tpu) and use_original_images):
+                eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(category_index, max_examples_to_draw=eval_config.num_visualizations, max_boxes_to_draw=eval_config.max_num_boxes_to_visualize, min_score_thresh=eval_config.min_score_threshold, use_normalized_coordinates=False)
+                vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(eval_dict)
+            eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(eval_config, list(category_index.values()), eval_dict)
+            for (loss_key, loss_tensor) in iter(losses_dict.items()):
+                eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
+            for var in optimizer_summary_vars:
+                eval_metric_ops[var.op.name] = (var, tf.no_op())
+            if (vis_metric_ops is not None):
+                eval_metric_ops.update(vis_metric_ops)
+            eval_metric_ops = {str(k): v for (k, v) in eval_metric_ops.items()}
+            if eval_config.use_moving_averages:
+                variable_averages = tf.train.ExponentialMovingAverage(0.0)  # only variables_to_restore() is used from this object
+                variables_to_restore = variable_averages.variables_to_restore()
+                keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours
+                saver = tf.train.Saver(variables_to_restore, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)
+                scaffold = tf.train.Scaffold(saver=saver)
+        
+        # Assemble the TPUEstimatorSpec / EstimatorSpec that is returned to the estimator.
+        if (use_tpu and (mode != tf.estimator.ModeKeys.EVAL)):
+            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs)
+        
+        else:
+            if (scaffold is None):
+                keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours
+                saver = tf.train.Saver(sharded=True, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, save_relative_paths=True)
+                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
+                scaffold = tf.train.Scaffold(saver=saver)
+            return tf.estimator.EstimatorSpec(mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs, scaffold=scaffold)
+    return model_fn
+
+def create_estimator_and_inputs(run_config, hparams, pipeline_config_path, eval_count=1, config_override=None, train_steps=None, sample_1_of_n_eval_examples=1, sample_1_of_n_eval_on_train_examples=1, model_fn_creator=create_model_fn, use_tpu_estimator=False, use_tpu=False, num_shards=1, params=None, override_eval_num_epochs=True, save_final_config=False, **kwargs):
+    "Creates `Estimator`, input functions, and steps.\n\n  Args:\n    run_config: A `RunConfig`. Only its `model_dir` and `session_config` are\n      carried over into the `RunConfig` that is actually used; the NPU\n      optimizer entry is added to that `session_config` in place, so it must\n      not be None.\n    hparams: A `HParams`.\n    pipeline_config_path: A path to a pipeline config file.\n    eval_count: Divisor applied to the training steps to obtain the checkpoint\n      interval (`train_steps // eval_count`, at least 1); presumably the number\n      of evaluation rounds planned for the run. Values below 1 are treated as 1.\n    config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to\n      override the config from `pipeline_config_path`.\n    train_steps: Number of training steps. If None, the number of training steps\n      is set from the `TrainConfig` proto. If it is still unset afterwards, the\n      checkpoint interval is left at the `RunConfig` default.\n    sample_1_of_n_eval_examples: Integer representing how often an eval example\n      should be sampled. If 1, will sample all examples.\n    sample_1_of_n_eval_on_train_examples: Similar to\n      `sample_1_of_n_eval_examples`, except controls the sampling of training\n      data for evaluation.\n    model_fn_creator: A function that creates a `model_fn` for `Estimator`.\n      Follows the signature:\n\n      * Args:\n        * `detection_model_fn`: Function that returns `DetectionModel` instance.\n        * `configs`: Dictionary of pipeline config objects.\n        * `hparams`: `HParams` object.\n      * Returns:\n        `model_fn` for `Estimator`.\n\n    use_tpu_estimator: Whether a `TPUEstimator` should be returned. If False,\n      an `Estimator` will be returned.\n    use_tpu: Boolean, whether training and evaluation should run on TPU. Only\n      used if `use_tpu_estimator` is True.\n    num_shards: Number of shards (TPU cores). Only used if `use_tpu_estimator`\n      is True.\n    params: Parameter dictionary passed from the estimator. Only used if\n      `use_tpu_estimator` is True.\n    override_eval_num_epochs: Whether to overwrite the number of epochs to\n      1 for eval_input.\n    save_final_config: Whether to save final config (obtained after applying\n      overrides) to `estimator.model_dir`.\n    **kwargs: Additional keyword arguments for configuration override.\n\n  Returns:\n    A dictionary with the following fields:\n    'estimator': An `Estimator` or `TPUEstimator`.\n    'train_input_fn': A training input function.\n    'eval_input_fns': A list of all evaluation input functions.\n    'eval_input_names': A list of names for each evaluation input.\n    'eval_on_train_input_fn': An evaluation-on-train input function.\n    'predict_input_fn': A prediction input function.\n    'train_steps': Number of training steps. Either directly from input or from\n      configuration.\n    'train_batch_size': train batch size per GPU\n  "
+    get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP['get_configs_from_pipeline_file']
+    merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP['merge_external_params_with_configs']
+    create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP['create_pipeline_proto_from_configs']
+    create_train_input_fn = MODEL_BUILD_UTIL_MAP['create_train_input_fn']
+    create_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn']
+    create_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn']
+    configs = get_configs_from_pipeline_file(pipeline_config_path, config_override=config_override)
+    kwargs.update({'train_steps': train_steps, 'sample_1_of_n_eval_examples': sample_1_of_n_eval_examples})
+    if override_eval_num_epochs:
+        kwargs.update({'eval_num_epochs': 1})
+        tf.logging.warning('Forced number of epochs for all eval validations to be 1.')
+    configs = merge_external_params_with_configs(configs, hparams, kwargs_dict=kwargs)
+    model_config = configs['model']
+    train_config = configs['train_config']
+    train_input_config = configs['train_input_config']
+    eval_config = configs['eval_config']
+    eval_input_configs = configs['eval_input_configs']
+    eval_on_train_input_config = copy.deepcopy(train_input_config)  # deep copy so the training input config itself is left untouched
+    eval_on_train_input_config.sample_1_of_n_examples = sample_1_of_n_eval_on_train_examples
+    if (override_eval_num_epochs and (eval_on_train_input_config.num_epochs != 1)):
+        tf.logging.warning('Expected number of evaluation epochs is 1, but instead encountered `eval_on_train_input_config.num_epochs` = {}. Overwriting `num_epochs` to 1.'.format(eval_on_train_input_config.num_epochs))
+        eval_on_train_input_config.num_epochs = 1
+    if ((train_steps is None) and (train_config.num_steps != 0)):
+        train_steps = train_config.num_steps
+    detection_model_fn = functools.partial(model_builder.build, model_config=model_config)
+    train_input_fn = create_train_input_fn(train_config=train_config, train_input_config=train_input_config, model_config=model_config)
+    eval_input_fns = [create_eval_input_fn(eval_config=eval_config, eval_input_config=eval_input_config, model_config=model_config) for eval_input_config in eval_input_configs]
+    eval_input_names = [eval_input_config.name for eval_input_config in eval_input_configs]
+    eval_on_train_input_fn = create_eval_input_fn(eval_config=eval_config, eval_input_config=eval_on_train_input_config, model_config=model_config)
+    predict_input_fn = create_predict_input_fn(model_config=model_config, predict_input_config=eval_input_configs[0])
+    export_to_tpu = hparams.get('export_to_tpu', False)  # only logged; the TPUEstimator below is built with export_to_tpu=False
+    tf.logging.info('create_estimator_and_inputs: use_tpu %s, export_to_tpu %s', use_tpu, export_to_tpu)
+    model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu)
+    custom_op = run_config.session_config.graph_options.rewrite_options.custom_optimizers.add()  # mutates the caller's session_config in place
+    custom_op.name = 'NpuOptimizer'
+    custom_op.parameter_map['precision_mode'].s = tf.compat.as_bytes("allow_mix_precision")
+    custom_op.parameter_map['mix_compile_mode'].b = True
+    # custom_op.parameter_map["dynamic_input"].b = True
+    # custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes("lazy_recompile")
+    custom_op.parameter_map["hcom_parallel"].b = True
+    custom_op.parameter_map["jit_compile"].b = False
+    # Checkpoint every train_steps // eval_count steps (never a 0-step interval); with no known step count keep RunConfig's default cadence instead of failing on None.
+    checkpoint_kwargs = ({'save_checkpoints_steps': max(1, (train_steps // max(1, eval_count)))} if train_steps else {})
+    run_config = tf.estimator.RunConfig(model_dir=run_config.model_dir, session_config=run_config.session_config, **checkpoint_kwargs)
+    if use_tpu_estimator:
+        estimator = tf.contrib.tpu.TPUEstimator(model_fn=model_fn, train_batch_size=train_config.batch_size, eval_batch_size=((num_shards * 1) if use_tpu else 1), use_tpu=False, config=run_config, params=(params if params else {}), eval_on_tpu=False, export_to_tpu=False)
+    else:
+        estimator = tf.estimator.Estimator(model_fn=model_fn, config=npu_run_config_init(run_config=run_config))
+    if (run_config.is_chief and save_final_config):
+        pipeline_config_final = create_pipeline_proto_from_configs(configs)
+        config_util.save_pipeline_config(pipeline_config_final, estimator.model_dir)
+    return dict(estimator=estimator, train_input_fn=train_input_fn, eval_input_fns=eval_input_fns, eval_input_names=eval_input_names, eval_on_train_input_fn=eval_on_train_input_fn, predict_input_fn=predict_input_fn, train_steps=train_steps, train_batch_size=train_config.batch_size)
+
+def create_train_and_eval_specs(train_input_fn, eval_input_fns, eval_on_train_input_fn, predict_input_fn, train_steps, eval_on_train_data=False, final_exporter_name='Servo', eval_spec_names=None):
+    'Creates a `TrainSpec` and `EvalSpec`s.\n\n  Args:\n    train_input_fn: Function that produces features and labels on train data.\n    eval_input_fns: A list of functions that produce features and labels on eval\n      data.\n    eval_on_train_input_fn: Function that produces features and labels for\n      evaluation on train data.\n    predict_input_fn: Function that produces features for inference.\n    train_steps: Number of training steps. It is divided by `get_rank_size()`\n      to obtain the `max_steps` of the `TrainSpec`; None is passed through\n      unchanged.\n    eval_on_train_data: Whether to evaluate model on training data. Default is\n      False.\n    final_exporter_name: String name given to `FinalExporter`.\n    eval_spec_names: A list of string names for each `EvalSpec`. Defaults to\n      the stringified indices of `eval_input_fns`.\n\n  Returns:\n    Tuple of `TrainSpec` and list of `EvalSpecs`. If `eval_on_train_data` is\n    True, the last `EvalSpec` in the list will correspond to training data. The\n    rest EvalSpecs in the list are evaluation datas.\n  '
+    # train_steps may legitimately be None (no step limit configured); only split it across ranks when it is set.
+    max_steps = (None if (train_steps is None) else (train_steps // get_rank_size()))
+    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=max_steps, hooks=[NpuEmptyHook()])
+    if (eval_spec_names is None):
+        eval_spec_names = [str(i) for i in range(len(eval_input_fns))]
+    eval_specs = []
+    for (index, (eval_spec_name, eval_input_fn)) in enumerate(zip(eval_spec_names, eval_input_fns)):
+        # The first eval input keeps the plain exporter name; later ones get the spec name appended.
+        exporter_name = (final_exporter_name if (index == 0) else '{}_{}'.format(final_exporter_name, eval_spec_name))
+        exporter = tf.estimator.FinalExporter(name=exporter_name, serving_input_receiver_fn=predict_input_fn)
+        eval_specs.append(tf.estimator.EvalSpec(name=eval_spec_name, input_fn=eval_input_fn, steps=None, exporters=exporter, hooks=npu_hooks_append()))
+    if eval_on_train_data:
+        eval_specs.append(tf.estimator.EvalSpec(name='eval_on_train', input_fn=eval_on_train_input_fn, steps=None, hooks=npu_hooks_append()))
+    return (train_spec, eval_specs)
+
+def continuous_eval(estimator, model_dir, input_fn, train_steps, name):
+    'Evaluates every checkpoint that appears in a model directory.\n\n  Iterates over the checkpoints yielded for `model_dir`, evaluates each one and\n  stops once a checkpoint whose step number is at least `train_steps` has been\n  evaluated. Checkpoints that disappear before they can be evaluated are\n  skipped.\n\n  Args:\n    estimator: Estimator object to use for evaluation.\n    model_dir: Model directory to read checkpoints for continuous evaluation.\n    input_fn: Input function to use for evaluation.\n    train_steps: Number of training steps. This is used to infer the last\n      checkpoint and stop evaluation loop.\n    name: Namescope for eval summary.\n  '
+    def _stop_on_timeout():
+        tf.logging.info('Terminating eval after 180 seconds of no checkpoints')
+        return True
+    for checkpoint_path in tf.contrib.training.checkpoints_iterator(model_dir, min_interval_secs=180, timeout=None, timeout_fn=_stop_on_timeout):
+        tf.logging.info('Starting Evaluation.')
+        try:
+            metrics = estimator.evaluate(input_fn=input_fn, steps=None, checkpoint_path=checkpoint_path, name=name)
+            tf.logging.info(('Eval results: %s' % metrics))
+            reached_step = int(os.path.basename(checkpoint_path).split('-')[1])
+        except tf.errors.NotFoundError:
+            tf.logging.info(('Checkpoint %s no longer exists, skipping checkpoint' % checkpoint_path))
+            continue
+        if (reached_step >= train_steps):
+            tf.logging.info(('Evaluation finished after training step %d' % reached_step))
+            break
+
+def populate_experiment(run_config, hparams, pipeline_config_path, train_steps=None, eval_steps=None, model_fn_creator=create_model_fn, **kwargs):
+    'Populates an `Experiment` object.\n\n  EXPERIMENT CLASS IS DEPRECATED. Please switch to\n  tf.estimator.train_and_evaluate. As an example, see model_main.py.\n\n  Args:\n    run_config: A `RunConfig`.\n    hparams: A `HParams`.\n    pipeline_config_path: A path to a pipeline config file.\n    train_steps: Number of training steps. If None, the number of training steps\n      is set from the `TrainConfig` proto.\n    eval_steps: Number of evaluation steps per evaluation cycle. If None, the\n      number of evaluation steps is set from the `EvalConfig` proto.\n    model_fn_creator: A function that creates a `model_fn` for `Estimator`.\n      Follows the signature:\n\n      * Args:\n        * `detection_model_fn`: Function that returns `DetectionModel` instance.\n        * `configs`: Dictionary of pipeline config objects.\n        * `hparams`: `HParams` object.\n      * Returns:\n        `model_fn` for `Estimator`.\n\n    **kwargs: Additional keyword arguments for configuration override.\n\n  Returns:\n    An `Experiment` that defines all aspects of training, evaluation, and\n    export.\n  '
+    tf.logging.warning('Experiment is being deprecated. Please use tf.estimator.train_and_evaluate(). See model_main.py for an example.')
+    train_and_eval_dict = create_estimator_and_inputs(run_config, hparams, pipeline_config_path, train_steps=train_steps, eval_steps=eval_steps, model_fn_creator=model_fn_creator, save_final_config=True, **kwargs)
+    estimator = train_and_eval_dict['estimator']
+    train_input_fn = train_and_eval_dict['train_input_fn']
+    eval_input_fns = train_and_eval_dict['eval_input_fns']
+    predict_input_fn = train_and_eval_dict['predict_input_fn']
+    train_steps = train_and_eval_dict['train_steps']
+    export_strategies = [tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy(serving_input_fn=predict_input_fn)]
+    return tf.contrib.learn.Experiment(estimator=estimator, train_input_fn=train_input_fn, eval_input_fn=eval_input_fns[0], train_steps=train_steps, eval_steps=None, export_strategies=export_strategies, eval_delay_secs=120)
diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/models/research/object_detection/model_main_rt.py b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/models/research/object_detection/model_main_rt.py
new file mode 100644
index 0000000000000000000000000000000000000000..46bcfb668728927670383d130dfa5d004904f09e
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/models/research/object_detection/model_main_rt.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+'Binary to run train and evaluation on object detection model.'
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from npu_bridge.npu_init import *
+from tensorflow.core.protobuf import config_pb2
+from absl import flags
+import tensorflow as tf
+#import horovod.tensorflow as hvd
+import dllogger
+import time
+import os
+from object_detection import model_hparams
+from object_detection import model_lib_rt
+from object_detection.utils.exp_utils import AverageMeter, setup_dllogger
+
+# Command-line flags of this binary. They are read through FLAGS, bound at the end of this section.
+# Output directory and pipeline configuration file.
+flags.DEFINE_string('model_dir', None, 'Path to output model directory where event and checkpoint files will be written.')
+flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config file.')
+# File name passed to setup_dllogger() by DLLoggerHook.
+flags.DEFINE_string('raport_file', default='summary.json', help='Path to dlloger json')
+# Training length and evaluation sampling options.
+flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.')
+flags.DEFINE_boolean('eval_training_data', False, 'If training data should be evaluated for this job. Note that one call only use this in eval-only mode, and `checkpoint_dir` must be supplied.')
+flags.DEFINE_integer('sample_1_of_n_eval_examples', 1, 'Will sample one of every n eval input examples, where n is provided.')
+flags.DEFINE_integer('sample_1_of_n_eval_on_train_examples', 5, 'Will sample one of every n train input examples for evaluation, where n is provided. This is only used if `eval_training_data` is True.')
+flags.DEFINE_integer('eval_count', 1, 'How many times the evaluation should be run')
+flags.DEFINE_string('hparams_overrides', None, 'Hyperparameter overrides, represented as a string containing comma-separated hparam_name=value pairs.')
+flags.DEFINE_string('checkpoint_dir', None, 'Path to directory holding a checkpoint.  If `checkpoint_dir` is provided, this binary operates in eval-only mode, writing resulting metrics to `model_dir`.')
+# Compilation and precision switches.
+flags.DEFINE_boolean('allow_xla', False, 'Enable XLA compilation')
+flags.DEFINE_boolean('amp', False, 'Whether to enable AMP ops. When false, uses TF32 on A100 and FP32 on V100 GPUS.')
+flags.DEFINE_boolean('run_once', False, 'If running in eval-only mode, whether to run just one round of eval vs running continuously (default).')
+# ---- Flags added for the NPU port: begin ----
+# NOTE(review): the overflow_dump* and step_dump* flags are not referenced in the visible part of this file -- confirm where they are consumed.
+flags.DEFINE_boolean('overflow_dump', False, 'Enable overflow op detection')
+flags.DEFINE_string('overflow_dump_path', None, 'Path to directory dump overflow ops data.')
+flags.DEFINE_boolean('check_loss_scale', False, 'check whether loss scale is valid')
+flags.DEFINE_boolean('step_dump', False, 'Enable dump step data, can only set when overflow_dump is not set')
+flags.DEFINE_string('step_dump_path', None, 'Path to directory dump step0 ops data.')
+flags.DEFINE_boolean('skip_eval', False, 'Whether to skip eval')
+# ---- Flags added for the NPU port: end ----
+# Global flag container used by the hooks and main() below.
+FLAGS = flags.FLAGS
+
+
+class DLLoggerHook(tf.estimator.SessionRunHook):
+
+    def __init__(self, global_batch_size, rank=(- 1)):
+        self.global_batch_size = global_batch_size
+        self.rank = rank
+        setup_dllogger(enabled=True, filename=FLAGS.raport_file, rank=rank)
+
+    def after_create_session(self, session, coord):
+        self.meters = {}
+        warmup = 100
+        self.meters['train_throughput'] = AverageMeter(warmup=warmup)
+
+    def before_run(self, run_context):
+        self.t0 = time.time()
+        return tf.estimator.SessionRunArgs(fetches=['global_step:0', 'learning_rate:0'])
+
+    def after_run(self, run_context, run_values):
+        throughput = (self.global_batch_size / (time.time() - self.t0))
+        (global_step, lr) = run_values.results
+        self.meters['train_throughput'].update(throughput)
+
+    def end(self, session):
+        summary = {'train_throughput': self.meters['train_throughput'].avg}
+        dllogger.log(step=tuple(), data=summary)
+
+###############################NPU_modify add#####################################
+class _LogSessionRunHook(tf.train.SessionRunHook):
+    def before_run(self, run_context):
+        return tf.estimator.SessionRunArgs(fetches=['overflow_status_reduce_all:0', 'loss_scale:0'])
+
+    def after_run(self, run_context, run_values):
+        if not run_values.results[0]:
+            print('Find overflow in this step, skip apply gradients, loss scale value=%d' % run_values.results[1],flush=True)
+        else:
+            print('Apply gradients, loss scale value=%d' % run_values.results[1],flush=True)
+###############################NPU_modify end#####################################
+def main(unused_argv):
+    tf.logging.set_verbosity(tf.logging.INFO)
+    #tf的混合精度
+    if FLAGS.amp:
+        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
+    else:
+        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '0'
+    flags.mark_flag_as_required('model_dir')
+    flags.mark_flag_as_required('pipeline_config_path')
+    if True:
+        session_config = npu_config_proto(config_proto=tf.ConfigProto())
+ 
+    session_config.gpu_options.per_process_gpu_memory_fraction = 0.9
+    session_config.gpu_options.visible_device_list = str(get_npu_local_rank_id())
+    if FLAGS.allow_xla:
+        if True:
+            session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+    model_dir = (FLAGS.model_dir if (get_npu_rank_id() == 0) else None)
+    config = tf.estimator.RunConfig(model_dir=model_dir, session_config=session_config)
+    
+    train_and_eval_dict = model_lib.create_estimator_and_inputs(run_config=config, eval_count=FLAGS.eval_count, hparams=model_hparams.create_hparams(FLAGS.hparams_overrides), pipeline_config_path=FLAGS.pipeline_config_path, train_steps=FLAGS.num_train_steps, sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples, sample_1_of_n_eval_on_train_examples=FLAGS.sample_1_of_n_eval_on_train_examples)
+    estimator = train_and_eval_dict['estimator']
+    train_input_fn = train_and_eval_dict['train_input_fn']
+    eval_input_fns = train_and_eval_dict['eval_input_fns']
+    eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
+    predict_input_fn = train_and_eval_dict['predict_input_fn']
+    train_steps = train_and_eval_dict['train_steps']
+    if FLAGS.checkpoint_dir:
+        if FLAGS.eval_training_data:
+            name = 'training_data'
+            input_fn = eval_on_train_input_fn
+        else:
+            name = 'validation_data'
+            input_fn = eval_input_fns[0]
+        #if FLAGS.run_once:
+        #    estimator.evaluate(input_fn, steps=None, checkpoint_path=tf.train.latest_checkpoint(FLAGS.checkpoint_dir))
+        #else:
+        #    model_lib.continuous_eval(estimator, FLAGS.checkpoint_dir, input_fn, train_steps, name)
+    else:
+        (train_spec, eval_specs) = model_lib.create_train_and_eval_specs(train_input_fn, eval_input_fns, eval_on_train_input_fn, predict_input_fn, train_steps, eval_on_train_data=False)
+        ##################################NPU_modify add###################################
+        if FLAGS.check_loss_scale:
+            train_hooks = [NpuEmptyHook(), DLLoggerHook((get_rank_size() * train_and_eval_dict['train_batch_size']), get_npu_rank_id()),_LogSessionRunHook()]
+        else:
+            train_hooks = [NpuEmptyHook(), DLLoggerHook((get_rank_size() * train_and_eval_dict['train_batch_size']), get_npu_rank_id())]      
+        #train_hooks = [NpuEmptyHook(), DLLoggerHook((get_rank_size() * train_and_eval_dict['train_batch_size']), get_rank_id())]
+        ##################################NPU_modify end###################################
+        eval_hooks = []
+        for x in range(FLAGS.eval_count):
+            estimator.train(train_input_fn, hooks=npu_hooks_append(hooks_list=train_hooks), steps=(train_steps // FLAGS.eval_count))
+            if (get_npu_rank_id() == 0):
+                eval_input_fn = eval_input_fns[0]
+                #eval阻塞，临时规避
+                if FLAGS.skip_eval:
+                  print("[debug]skip eval.")
+                else:
+                  print("[debug]enter eval process ...")
+                  results = estimator.evaluate(eval_input_fn, steps=None, hooks=eval_hooks)  
+
+if (__name__ == '__main__'):
+    session_config = tf.ConfigProto()
+    custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
+    # custom_op.parameter_map["mix_compile_mode"].b = True
+    custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("../../configs/ops_info.json")
+    (npu_sess, npu_shutdown) = init_resource(config=session_config)
+    tf.app.run()
+    shutdown_resource(npu_sess, npu_shutdown)
+    close_session(npu_sess)
diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_performance_1p_inceptionv2.sh b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_performance_1p_inceptionv2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b70f401d407481b1c238f4afbd3efe7faf6688ce
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_performance_1p_inceptionv2.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+cur_path=`pwd`
+export PYTHONPATH=$cur_path/../models/research:$cur_path/../models/research/slim:$PYTHONPATH
+
+#集合通信
+export RANK_SIZE=1
+export RANK_TABLE_FILE=$cur_path/../configs/${RANK_SIZE}p_${ASCEND_DEVICE_ID}.json
+export JOB_ID=10087
+RANK_ID_START=0
+
+#使能RT2.0
+export ENABLE_RUNTIME_V2=1
+
+#数据集参数
+data_path="/data"
+use_conda=0
+
+#训练参数，需要根据模型修改
+Network="SSD-InceptionV2_ID0510_for_TensorFlow"
+num_train_steps=300
+batch_size=24
+ckpt_path=/checkpoints
+pipeline_config=$cur_path/../models/research/configs/ssd_inception_v2_coco_1p.config
+
+
+#帮助提示，需要根据网络修改
+if [[ $1 == --help || $1 == -h ]];then 
+	echo "usage: ./train_performance_1p.sh <args>"
+
+	echo ""
+	echo "parameter explain:
+	--num_train_steps           training steps
+	--data_path                 source data of training
+	--ckpt_path                  pre-checkpoint path
+	--pipeline_config           pipeline config path
+	--skip_eval  whether to skip eval
+    -h/--help             Show help message
+	"
+	exit 1
+fi
+
+#入参设置，需要根据网络修改
+for para in $*
+do
+    if [[ $para == --num_train_steps* ]];then
+		num_train_steps=`echo ${para#*=}`
+	elif [[ $para == --data_path* ]];then
+		data_path=`echo ${para#*=}`
+	elif [[ $para == --ckpt_path* ]];then
+		ckpt_path=`echo ${para#*=}`
+	elif [[ $para == --pipeline_config* ]];then
+		pipeline_config=`echo ${para#*=}`
+    elif [[ $para == --use_conda* ]];then
+	    use_conda=`echo ${para#*=}`
+	elif [[ $para == --skip_eval* ]];then
+	    skip_eval=`echo ${para#*=}`
+    fi
+done	
+
+if [[ $data_path == "" ]];then
+	echo "[Error] para \"data_path\" must be config"
+	exit 1
+fi
+
+
+
+##########################执行训练#########################
+start_time=$(date +%s)
+cd $cur_path/../models/research
+if [  -f ${pipeline_config}.bak ];then
+   cp ${pipeline_config}.bak ${pipeline_config}
+else
+   cp ${pipeline_config} ${pipeline_config}.bak
+fi
+
+sed -i "s%/checkpoints%${ckpt_path}%p" ${pipeline_config} 
+sed -i "s%/data/coco2017_tfrecords%${data_path}/coco2017_tfrecords%p" ${pipeline_config} 
+
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+ do
+  echo "Device ID: $ASCEND_DEVICE_ID"
+  export RANK_ID=$RANK_ID
+  if [   -d $cur_path/output/${ASCEND_DEVICE_ID} ];then
+     rm -rf $cur_path/output/${ASCEND_DEVICE_ID}
+     mkdir -p $cur_path/output/${ASCEND_DEVICE_ID}
+  else
+     mkdir -p $cur_path/output/${ASCEND_DEVICE_ID}
+  fi
+
+#训练执行脚本，需要根据网络修改
+  nohup python3 -u ./object_detection/model_main_rt.py \
+       --pipeline_config_path=${pipeline_config} \
+       --model_dir=$cur_path/output/${ASCEND_DEVICE_ID} \
+       --data_path=${data_path}   \
+       --overflow_dump_path=${overflow_dump_path}   \
+       --step_dump_path=${step_dump_path}   \
+       --alsologtostder \
+       --amp \
+       --num_train_steps=${num_train_steps}  \
+       --skip_eval=True \
+       "${@:1}"  > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+done
+wait
+
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+##########################业务日志#########################
+grep ERROR $HOME/ascend/log/plog/*.log > $cur_path/output/$ASCEND_DEVICE_ID/plog_err.log
+
+###########################性能结果处理#########################
+echo "-----------------------Final result------------------------"
+#性能FPS计算，需要根据网络修改
+#FPS=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $2}'` 
+FPS=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'NR==2{print $2}'` 
+
+FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*'${FPS}'}'`
+echo "Final Performance images/sec : $FPS"
+
+################################E2E训练时长##########################
+echo "Final Training Duration sec : $e2e_time"
+
+################################性能看护#############################
+DeviceType=`uname -m`
+CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'RT2'_'perf'
+ActualFPS=${FPS}
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*1000/'${FPS}'}'`
+
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中，需要根据模型修改
+grep INFO:tensorflow:loss $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $3}'|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt
+
+ActualLoss=`awk 'END {print}' $cur_path/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt`
+echo "Network = ${Network}" > $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "BatchSize = ${batch_size}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+
diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_performance_1p_mobilenetv2.sh b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_performance_1p_mobilenetv2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b7251d32828716906667d7b9f564079d1fe45d50
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_performance_1p_mobilenetv2.sh
@@ -0,0 +1,173 @@
+#!bin/bash
+cur_path=`pwd`
+
+#环境设置，需要根据网络修改
+export PYTHONPATH=$cur_path/../models/research:$cur_path/../models/research/slim:$PYTHONPATH
+
+#集合通信
+export RANK_SIZE=1
+export RANK_TABLE_FILE=$cur_path/../configs/${RANK_SIZE}p_${ASCEND_DEVICE_ID}.json
+export JOB_ID=10087
+RANK_ID_START=0
+
+#使能RT2.0
+export ENABLE_RUNTIME_V2=1
+
+#数据集参数
+data_path="/data"
+use_conda=0
+
+#训练参数，需要根据模型修改
+Network="SSD-MobilenetV2_ID0499_for_TensorFlow"
+num_train_steps=1000
+batch_size=24
+ckpt_path=/checkpoints
+pipeline_config=$cur_path/../models/research/configs/ssd_mobilenet_v2_coco_1p.config
+
+#维测参数
+overflow_dump=False
+overflow_dump_path=$cur_path/output/overflow_dump
+step_dump=False
+step_dump_path=$cur_path/output/step_dump
+check_loss_scale=Flase
+
+#帮助提示，需要根据网络修改
+if [[ $1 == --help || $1 == -h ]];then 
+    echo "usage: ./train_performance_1p.sh <args>"
+    echo ""
+    echo "parameter explain:
+    --num_train_steps           training steps
+    --data_path                 source data of training
+    --ckpt_path                  pre-checkpoint path
+    --pipeline_config           pipeline config path
+    --overflow_dump        overflow detection，default is False
+    --overflow_dump_path   overflow dump path
+    --check_loss_scale     check whether loss scale is valid, default is False
+    --step_dump            Dump step data, default is False, can only set when overflow_dump is False
+    --step_dump_path      step_dump_path
+    -h/--help             Show help message
+    "
+    exit 1
+fi
+
+#入参设置，需要根据网络修改
+for para in $*
+do
+    if [[ $para == --num_train_steps* ]];then
+        num_train_steps=`echo ${para#*=}`
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --ckpt_path* ]];then
+        ckpt_path=`echo ${para#*=}`
+    elif [[ $para == --pipeline_config* ]];then
+        pipeline_config=`echo ${para#*=}`
+    elif [[ $para == --overflow_dump* ]];then
+        overflow_dump=`echo ${para#*=}`
+        if [  -d ${overflow_dump_path}  ];then
+            echo "overflow dump path: ${overflow_dump_path}"
+        else
+            mkdir -p ${overflow_dump_path}
+        fi
+    elif [[ $para == --check_loss_scale* ]];then
+        check_loss_scale=`echo ${para#*=}`
+    elif [[ $para == --step_dump* ]];then
+        step_dump=`echo ${para#*=}`
+        if [  -d ${step_dump_path}  ];then
+            echo "step dump path: ${step_dump_path}"
+        else
+            mkdir -p ${step_dump_path}
+        fi
+    elif [[ $para == --use_conda* ]];then
+	    use_conda=`echo ${para#*=}`
+    fi
+done
+
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+##########################执行训练#########################
+start_time=$(date +%s)
+cd $cur_path/../models/research
+if [  -f ${pipeline_config}.bak ];then
+   cp ${pipeline_config}.bak ${pipeline_config}
+else
+   cp ${pipeline_config} ${pipeline_config}.bak
+fi
+
+# 更改参数
+sed -i "s%/data/coco2017_tfrecords%${data_path}/coco2017_tfrecords%p" ${pipeline_config}
+
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+    do
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+    if [   -d $cur_path/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf $cur_path/output/${ASCEND_DEVICE_ID}
+        mkdir -p $cur_path/output/${ASCEND_DEVICE_ID}
+    else
+        mkdir -p $cur_path/output/${ASCEND_DEVICE_ID}
+    fi
+
+    #训练执行脚本，需要根据网络修改
+    nohup python3 -u ./object_detection/model_main_rt.py \
+        --pipeline_config_path=${pipeline_config} \
+        --model_dir=$cur_path/output/${ASCEND_DEVICE_ID}/npu_ckpt_mobilenetv2_${RANK_SIZE}p\
+        --data_path=${data_path}   \
+        --overflow_dump_path=${overflow_dump_path}   \
+        --step_dump_path=${step_dump_path}   \
+        --alsologtostder \
+        --amp \
+        --num_train_steps=${num_train_steps}  \
+        --skip_eval=True \
+        "${@:1}"  > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+done
+wait
+
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+echo "Final Training Duration sec : $e2e_time"
+
+# 参数回改
+sed -i "s%${data_path}/coco2017_tfrecords%/data/coco2017_tfrecords%p" ${pipeline_config}
+
+
+################################性能结果处理#########################
+echo "-----------------------Final result------------------------"
+# 性能FPS计算，需要根据网络修改
+step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $2}'|tail -2|head -n 1`
+FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${step_sec}'}'`
+echo "Final Performance images/sec : ${FPS}"
+
+#################################精度结果处理#########################
+# 精度计算，需要根据网络修改
+train_accuracy=`grep Precision $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'NR==1 {print $13}'`
+echo "Final Training Accuracy mAP: ${train_accuracy}"
+
+#################################性能看护#############################
+# 训练用例信息，不需要修改
+DeviceType=`uname -m`
+BatchSize=${batch_size}
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'perf'
+ActualFPS=${FPS}
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*1000/'${FPS}'}'`
+
+#################################Loss#########################
+# 从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中，需要根据模型修改
+grep INFO:tensorflow:loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $3}'|sed 's/,//g'|sed '/^$/d' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+ActualLoss=`awk 'END {print}' train_loss.txt`
+# eval版本需求开发中，精度结果临时看护最终的loss
+echo "Final Training Accuracy loss: ${ActualLoss}"
+
+# 关键信息打印到${CaseName}.log中，不需要修改
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_performance_8p_inceptionv2.sh b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_performance_8p_inceptionv2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..69611f27b45cf91737c97f8ee77f688665c9dafa
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_performance_8p_inceptionv2.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+cur_path=`pwd`
+export PYTHONPATH=$cur_path/../models/research:$cur_path/../models/research/slim:$PYTHONPATH
+export HCCL_CONNECT_TIMEOUT=200
+#集合通信
+export RANK_SIZE=8
+export RANK_TABLE_FILE=$cur_path/../configs/${RANK_SIZE}p.json
+export JOB_ID=10087
+RANK_ID_START=0
+ASCEND_DEVICE_ID_START=0
+
+#使能RT2.0
+export ENABLE_RUNTIME_V2=1
+
+#数据集参数
+data_path="/data"
+use_conda=0
+
+#训练参数，需要根据模型修改
+Network="SSD-InceptionV2_ID0510_for_TensorFlow"
+num_train_steps=300
+batch_size=24
+ckpt_path=/checkpoints
+pipeline_config=$cur_path/../models/research/configs/ssd_inception_v2_coco_8p.config
+
+#帮助提示，需要根据网络修改
+if [[ $1 == --help || $1 == -h ]];then 
+	echo "usage: ./train_performance_8p_inceptionv2.sh <args>"
+
+	echo ""
+	echo "parameter explain:
+	--num_train_steps           training steps
+	--data_path                 source data of training
+	--ckpt_path                  pre-checkpoint path
+	--pipeline_config           pipeline config path
+	--skip_eval  whether to skip eval
+    -h/--help             Show help message
+	"
+	exit 1
+fi
+
+#入参设置，需要根据网络修改
+for para in $*
+do
+    if [[ $para == --num_train_steps* ]];then
+		num_train_steps=`echo ${para#*=}`
+	elif [[ $para == --data_path* ]];then
+		data_path=`echo ${para#*=}`
+	elif [[ $para == --ckpt_path* ]];then
+		ckpt_path=`echo ${para#*=}`
+	elif [[ $para == --pipeline_config* ]];then
+		pipeline_config=`echo ${para#*=}`
+    elif [[ $para == --use_conda* ]];then
+	    use_conda=`echo ${para#*=}`
+	elif [[ $para == --skip_eval* ]];then
+	    skip_eval=`echo ${para#*=}`
+    fi
+done	
+
+if [[ $data_path == "" ]];then
+	echo "[Error] para \"data_path\" must be config"
+	exit 1
+fi
+
+
+##########################执行训练#########################
+start_time=$(date +%s)
+cd $cur_path/../models/research
+if [  -f ${pipeline_config}.bak ];then
+   cp ${pipeline_config}.bak ${pipeline_config}
+else
+   cp ${pipeline_config} ${pipeline_config}.bak
+fi
+
+sed -i "s%/checkpoints%${ckpt_path}%p" ${pipeline_config} 
+sed -i "s%/data/coco2017_tfrecords%${data_path}/coco2017_tfrecords%p" ${pipeline_config} 
+
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+ do
+  export RANK_ID=$RANK_ID
+  export ASCEND_DEVICE_ID=$((ASCEND_DEVICE_ID_START+RANK_ID))
+  echo "Device ID: $ASCEND_DEVICE_ID"
+  if [   -d $cur_path/output/${ASCEND_DEVICE_ID} ];then
+     rm -rf $cur_path/output/${ASCEND_DEVICE_ID}
+     mkdir -p $cur_path/output/${ASCEND_DEVICE_ID}
+  else
+     mkdir -p $cur_path/output/${ASCEND_DEVICE_ID}
+  fi
+
+#训练执行脚本，需要根据网络修改
+  nohup python3 -u ./object_detection/model_main_rt.py \
+       --pipeline_config_path=${pipeline_config} \
+       --model_dir=$cur_path/output/${ASCEND_DEVICE_ID} \
+       --data_path=${data_path}   \
+       --alsologtostder \
+       --amp \
+       --num_train_steps=${num_train_steps}  \
+	   --skip_eval=True \
+       "${@:1}"  > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+done
+wait
+
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+ASCEND_DEVICE_ID=0
+
+
+##########################业务日志#########################
+grep ERROR $HOME/ascend/log/plog/*.log > $cur_path/output/${ASCEND_DEVICE_ID}/plog_err.log
+
+################################性能结果处理#########################
+echo "-----------------------Final result------------------------"
+#性能FPS计算，需要根据网络修改
+#FPS=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $2}'` 
+FPS=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'NR==2{print $2}'`
+FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*'${FPS}'}'`
+echo "Final Performance images/sec : $FPS"
+################################精度结果处理#########################
+#精度计算，需要根据网络修改
+train_accuracy=`grep Precision $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep Average |awk 'NR==1 {print $13}'`
+
+#echo 'Final Training Accuracy mAP: $train_accuracy'
+################################E2E训练时长##########################
+echo "Final Training Duration sec : $e2e_time"
+
+################################性能看护#############################
+DeviceType=`uname -m`
+CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'RT2'_'perf'
+ActualFPS=${FPS}
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*1000/'${FPS}'}'`
+
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中，需要根据模型修改
+grep INFO:tensorflow:loss $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $3}'|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt
+
+ActualLoss=`awk 'END {print}' $cur_path/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt`
+echo "Network = ${Network}" > $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "BatchSize = ${batch_size}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+
diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_performance_8p_mobilenetv2.sh b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_performance_8p_mobilenetv2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc43bb9f47cda50606fdd222709a77a5e9251
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_RT2_performance_8p_mobilenetv2.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+# RT2.0 performance run of SSD-MobilenetV2 on 8 devices.
+# All relative paths below are resolved against the directory the script is
+# started from.
+cur_path=`pwd`
+
+# Environment setup; adjust per network.
+export PYTHONPATH=$cur_path/../models/research:$cur_path/../models/research/slim:$PYTHONPATH
+export HCCL_CONNECT_TIMEOUT=200
+# Collective-communication settings.
+export RANK_SIZE=8
+export RANK_TABLE_FILE=$cur_path/../configs/${RANK_SIZE}p.json
+export JOB_ID=10087
+RANK_ID_START=0
+ASCEND_DEVICE_ID_START=0
+
+# Enable RT2.0.
+export ENABLE_RUNTIME_V2=1
+
+# Dataset parameters (overridable with --data_path / --use_conda).
+data_path="/data"
+use_conda=0
+
+# Training parameters; adjust per model.
+Network="SSD-MobilenetV2_ID0499_for_TensorFlow"
+num_train_steps=1000
+batch_size=24
+ckpt_path=/checkpoints
+pipeline_config=$cur_path/../models/research/configs/ssd_mobilenet_v2_coco_8p.config
+
+# Help text; adjust per network. Printed when the first argument is
+# --help or -h, after which the script exits with status 1.
+if [[ $1 == --help || $1 == -h ]];then 
+    echo "usage: ./train_performance_8p.sh <args>"
+    echo ""
+    echo "parameter explain:
+    --num_train_steps           training steps
+    --data_path                 source data of training
+    --ckpt_path                  pre-checkpoint path
+    --pipeline_config           pipeline config path
+	--skip_eval  whether to skip eval
+    -h/--help                   Show help message
+    "
+    exit 1
+fi
+
+# Parse --key=value arguments; adjust per network.
+# "$@" keeps each argument intact (no re-splitting on whitespace), and the
+# value is taken with a plain parameter expansion instead of `echo`, which
+# would mangle values such as "-n" or ones containing glob characters.
+# Arguments that match none of the prefixes are ignored here.
+for para in "$@"
+do
+    if [[ $para == --num_train_steps* ]];then
+        num_train_steps=${para#*=}
+    elif [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --ckpt_path* ]];then
+        ckpt_path=${para#*=}
+    elif [[ $para == --pipeline_config* ]];then
+        pipeline_config=${para#*=}
+    elif [[ $para == --use_conda* ]];then
+        use_conda=${para#*=}
+    elif [[ $para == --skip_eval* ]];then
+        skip_eval=${para#*=}
+    fi
+done
+
+# data_path has a default, so this only triggers when it was explicitly
+# overridden with an empty value.
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+########################## Run training #########################
+start_time=$(date +%s)
+cd $cur_path/../models/research
+# Keep a pristine copy of the pipeline config: restore from the backup when
+# one exists, otherwise create it before the file is edited below.
+if [  -f ${pipeline_config}.bak ];then
+   cp ${pipeline_config}.bak ${pipeline_config}
+else
+   cp ${pipeline_config} ${pipeline_config}.bak
+fi
+
+# Point the dataset paths in the config at ${data_path}.
+# No "p" flag here: with "sed -i" (and no -n) it would write every matched
+# line into the config twice.
+sed -i "s%/data/coco2017_tfrecords%${data_path}/coco2017_tfrecords%g" ${pipeline_config}
+
+# Launch one background training process per rank, then wait for all of them.
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+    do
+    export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$((ASCEND_DEVICE_ID_START+RANK_ID))
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    # Start from an empty per-device output directory
+    # (rm -rf is a no-op when the directory does not exist yet).
+    rm -rf $cur_path/output/${ASCEND_DEVICE_ID}
+    mkdir -p $cur_path/output/${ASCEND_DEVICE_ID}
+
+    # Training command; adjust per network.
+    # All script arguments are forwarded unchanged via "${@:1}" after the
+    # explicit flags.
+    nohup python3 -u ./object_detection/model_main_rt.py \
+        --pipeline_config_path=${pipeline_config} \
+        --model_dir=$cur_path/output/${ASCEND_DEVICE_ID}/npu_ckpt_mobilenetv2_${RANK_SIZE}p \
+        --data_path=${data_path}   \
+        --alsologtostderr \
+        --amp \
+        --skip_eval=True \
+        --num_train_steps=${num_train_steps}  \
+        "${@:1}"  > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+done
+wait
+
+# End-to-end wall-clock time of the training phase, in seconds.
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+echo "Final Training Duration sec : $e2e_time"
+# The performance figures below are read from device 0's log.
+ASCEND_DEVICE_ID=0
+
+# Revert the dataset paths in the pipeline config.
+# No "p" flag: with "sed -i" it would duplicate every matched line.
+sed -i "s%${data_path}/coco2017_tfrecords%/data/coco2017_tfrecords%g" ${pipeline_config}
+
+
+################################ Performance results #########################
+echo "-----------------------Final result------------------------"
+# FPS computation; adjust per network.
+# Takes the second-to-last "global_step/sec" value from the log of the device
+# currently selected by ASCEND_DEVICE_ID.
+perf_log=$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log
+step_sec=$(grep -a 'INFO:tensorflow:global_step/sec: ' $perf_log | awk '{print $2}' | tail -2 | head -n 1)
+FPS=$(awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${step_sec}'}')
+echo "Final Performance images/sec : ${FPS}"
+
+################################# Accuracy results #########################
+# Accuracy extraction; adjust per network.
+train_accuracy=$(grep Precision $perf_log | awk 'NR==1 {print $13}')
+echo "Final Training Accuracy mAP: ${train_accuracy}"
+
+################################# Performance monitoring #############################
+# Test-case information; no need to modify.
+DeviceType=$(uname -m)
+BatchSize=${batch_size}
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'perf'
+ActualFPS=${FPS}
+TrainingTime=$(awk 'BEGIN{printf "%.2f\n",'${batch_size}'*1000/'${FPS}'}')
+
+################################# Loss #########################
+# Loss values and the summary file are taken from / written to device 7.
+ASCEND_DEVICE_ID=7
+loss_dir=$cur_path/output/$ASCEND_DEVICE_ID
+# Extract the loss column from train_$ASCEND_DEVICE_ID.log into
+# train_${CaseName}_loss.txt; adjust per model.
+grep INFO:tensorflow:loss $loss_dir/train_$ASCEND_DEVICE_ID.log | awk '{print $3}' | sed 's/,//g' | sed '/^$/d' > $loss_dir/train_${CaseName}_loss.txt
+ActualLoss=$(awk 'END {print}' $loss_dir/train_${CaseName}_loss.txt)
+echo "Final Training Accuracy loss: ${ActualLoss}"
+
+# Write the key figures to ${CaseName}.log; no need to modify.
+# A single truncating redirect replaces the original "> then >>" sequence.
+{
+    echo "Network = ${Network}"
+    echo "RankSize = ${RANK_SIZE}"
+    echo "BatchSize = ${BatchSize}"
+    echo "DeviceType = ${DeviceType}"
+    echo "CaseName = ${CaseName}"
+    echo "ActualFPS = ${ActualFPS}"
+    echo "TrainingTime = ${TrainingTime}"
+    echo "TrainAccuracy = ${train_accuracy}"
+    echo "ActualLoss = ${ActualLoss}"
+    echo "E2ETrainingTime = ${e2e_time}"
+} > $loss_dir/${CaseName}.log
diff --git a/TensorFlow/built-in/nlp/CNN-CTC_ID0683_for_TensorFlow/cnn_lstm_otc_ocr_rt.py b/TensorFlow/built-in/nlp/CNN-CTC_ID0683_for_TensorFlow/cnn_lstm_otc_ocr_rt.py
new file mode 100644
index 0000000000000000000000000000000000000000..6207ae2dd761573bcfa750621ebd50b1c7ff7fd2
--- /dev/null
+++ b/TensorFlow/built-in/nlp/CNN-CTC_ID0683_for_TensorFlow/cnn_lstm_otc_ocr_rt.py
@@ -0,0 +1,242 @@
+#
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+
+"""
+from npu_bridge.npu_init import *
+
+import tensorflow as tf
+import utils
+from tensorflow.python.framework import dtypes
+from npu_bridge.estimator.npu.npu_dynamic_rnn import DynamicRNN
+
+# Flag values and the class count are both taken from the project's utils
+# module; num_classes sizes the final projection layer of the model below.
+FLAGS = utils.FLAGS
+num_classes = utils.num_classes
+
+
+class LSTMOCR(object):
+    """CNN feature extractor + two NPU ``DynamicRNN`` layers + CTC head.
+
+    The constructor only creates the input placeholders; call
+    :meth:`build_graph` to add the network, the loss, the optimizer and the
+    decoder ops. After ``build_graph`` the instance exposes, among others,
+    ``logits``, ``seq_len``, ``loss``, ``cost``, ``lrn_rate``,
+    ``global_step``, ``train_op``, ``dense_decoded`` and ``merged_summay``
+    (the attribute name is misspelled on purpose-kept: callers read it under
+    that name).
+    """
+
+    def __init__(self, mode):
+        """Create the placeholders.
+
+        Args:
+            mode: Run-mode string. ``'train'`` turns on dropout and the
+                training behaviour of batch norm; any other value turns
+                them off.
+        """
+        self.mode = mode
+        # image
+        self.inputs = tf.placeholder(tf.float32, [None, FLAGS.image_height, FLAGS.image_width, FLAGS.image_channel])
+        # SparseTensor required by ctc_loss op
+        self.labels = tf.sparse_placeholder(tf.int32)
+        # 1d array of size [batch_size]
+        # self.seq_len = tf.placeholder(tf.int32, [None])
+        # l2
+        self._extra_train_ops = []
+
+    def build_graph(self):
+        """Build the model, the training ops and the merged summary op."""
+        self._build_model()
+        self._build_train_op()
+
+        self.merged_summay = tf.summary.merge_all()
+
+    def _build_model(self):
+        """Build the CNN, the recurrent layers and the output projection.
+
+        Sets ``self.seq_len`` and ``self.logits`` (time-major after the final
+        transpose).
+
+        Raises:
+            ValueError: If ``FLAGS.cnn_count`` exceeds the number of times
+                the smaller image side can be halved (rounding up) before
+                it reaches 1.
+        """
+        # Channel sizes per CNN unit: unit i maps filters[i] -> filters[i + 1].
+        filters = [1, 64, 128, 128, FLAGS.out_channels]
+        # strides[0] is used by the convolutions, strides[1] by the max-pools.
+        strides = [1, 2]
+
+        feature_h = FLAGS.image_height
+        feature_w = FLAGS.image_width
+
+        # Count how many stride-2 reductions the smaller side supports.
+        count_ = 0
+        min_size = min(FLAGS.image_height, FLAGS.image_width)
+        while min_size > 1:
+            min_size = (min_size + 1) // 2
+            count_ += 1
+        # The original `assert (cond, msg)` asserted a non-empty tuple, which
+        # is always true, so this check never ran. Raise explicitly instead
+        # (also survives `python -O`).
+        if FLAGS.cnn_count > count_:
+            raise ValueError("FLAGS.cnn_count should be <= {}!".format(count_))
+
+        # CNN part
+        with tf.variable_scope('cnn'):
+            x = self.inputs
+            for i in range(FLAGS.cnn_count):
+                with tf.variable_scope('unit-%d' % (i + 1)):
+                    x = self._conv2d(x, 'cnn-%d' % (i + 1), 3, filters[i], filters[i + 1], strides[0])
+                    x = self._batch_norm('bn%d' % (i + 1), x)
+                    x = self._leaky_relu(x, FLAGS.leakiness)
+                    x = self._max_pool(x, 2, strides[1])
+
+                    # print('----x.get_shape().as_list(): {}'.format(x.get_shape().as_list()))
+                    _, feature_h, feature_w, _ = x.get_shape().as_list()
+            print('\nfeature_h: {}, feature_w: {}'.format(feature_h, feature_w))
+
+        # LSTM part
+        with tf.variable_scope('lstm'):
+            x = tf.transpose(x, [0, 2, 1, 3])  # [batch_size, feature_w, feature_h, FLAGS.out_channels]
+            # treat `feature_w` as max_timestep in lstm.
+            # NOTE(review): the reshape assumes the last CNN unit emits
+            # FLAGS.out_channels channels and that every batch holds exactly
+            # FLAGS.batch_size images -- confirm for cnn_count != 4.
+            x = tf.reshape(x, [FLAGS.batch_size, feature_w, feature_h * FLAGS.out_channels])
+            print('lstm input shape: {}'.format(x.get_shape().as_list()))
+            self.seq_len = tf.fill([x.get_shape().as_list()[0]], feature_w)
+            # print('self.seq_len.shape: {}'.format(self.seq_len.shape.as_list()))
+
+            # Two NPU DynamicRNN layers, used here in place of the previous
+            # MultiRNNCell([LSTMCell, LSTMCell]) + tf.nn.dynamic_rnn stack.
+            # DynamicRNN is called with time_major=True, so the input is
+            # transposed to put the time axis first and transposed back below.
+            inputdata = tf.transpose(x, [1, 0, 2])
+            fw_cell1 = DynamicRNN(FLAGS.num_hidden, dtypes.float32, time_major=True, forget_bias=1.0)
+            fw_cell2 = DynamicRNN(FLAGS.num_hidden, dtypes.float32, time_major=True, forget_bias=1.0)
+            y, _, _, _, _, _, _, _ = fw_cell1(inputdata)
+            # NOTE(review): 0.8 is hard-coded here rather than read from a
+            # flag -- confirm that is intended.
+            if self.mode == 'train':
+                y = npu_ops.dropout(y, 0.8)
+            outputs, _, _, _, _, _, _, _ = fw_cell2(y)
+            if self.mode == 'train':
+                outputs = npu_ops.dropout(outputs, 0.8)
+            outputs = tf.transpose(outputs, [1, 0, 2])
+
+            # Reshaping to apply the same weights over the timesteps
+            outputs = tf.reshape(outputs, [-1, FLAGS.num_hidden])  # [batch_size * max_stepsize, FLAGS.num_hidden]
+
+            W = tf.get_variable(name='W_out',
+                                shape=[FLAGS.num_hidden, num_classes],
+                                dtype=tf.float32,
+                                initializer=tf.glorot_uniform_initializer())  # tf.glorot_normal_initializer
+            b = tf.get_variable(name='b_out',
+                                shape=[num_classes],
+                                dtype=tf.float32,
+                                initializer=tf.constant_initializer())
+
+            self.logits = tf.matmul(outputs, W) + b
+            # Reshaping back to the original shape
+            shape = tf.shape(x)
+            self.logits = tf.reshape(self.logits, [shape[0], -1, num_classes])
+            # Time major
+            self.logits = tf.transpose(self.logits, (1, 0, 2))
+
+    def _build_train_op(self):
+        """Add the CTC loss, the learning-rate schedule, Adam and the decoder.
+
+        Sets ``global_step``, ``loss``, ``cost``, ``lrn_rate``, ``optimizer``,
+        ``train_op``, ``decoded``, ``log_prob`` and ``dense_decoded``.
+        """
+        # self.global_step = tf.Variable(0, trainable=False)
+        self.global_step = tf.train.get_or_create_global_step()
+
+        self.loss = tf.nn.ctc_loss(labels=self.labels,
+                                   inputs=self.logits,
+                                   sequence_length=self.seq_len)
+        ####NPU modify begin####
+        # self.loss = util.set_graph_exec_config(self.loss, True, "dynamic_execute", "data:[128~128,60~60,180~180,1~1],[640~896],[640~896,2]")
+        ####NPU modify end####
+        # Mean of the per-example CTC losses; this is the value that gets
+        # logged and summarised.
+        self.cost = tf.reduce_mean(self.loss)
+        tf.summary.scalar('cost', self.cost)
+
+        self.lrn_rate = tf.train.exponential_decay(FLAGS.initial_learning_rate,
+                                                   self.global_step,
+                                                   FLAGS.decay_steps,
+                                                   FLAGS.decay_rate,
+                                                   staircase=True)
+        tf.summary.scalar('learning_rate', self.lrn_rate)
+
+        # self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.lrn_rate,
+        #                                            momentum=FLAGS.momentum).minimize(self.cost,
+        #                                                                              global_step=self.global_step)
+        # self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.lrn_rate,
+        #                                             momentum=FLAGS.momentum,
+        #                                             use_nesterov=True).minimize(self.cost,
+        #                                                                         global_step=self.global_step)
+        # Note that the optimizer minimizes self.loss (the un-reduced CTC
+        # loss), not self.cost.
+        self.optimizer = npu_tf_optimizer(tf.train.AdamOptimizer(learning_rate=self.lrn_rate,
+                                                beta1=FLAGS.beta1,
+                                                beta2=FLAGS.beta2)).minimize(self.loss,
+                                                                            global_step=self.global_step)
+        train_ops = [self.optimizer] + self._extra_train_ops
+        self.train_op = tf.group(*train_ops)
+
+        # Option 2: tf.nn.ctc_beam_search_decoder
+        # (it's slower but you'll get better results)
+        self.decoded, self.log_prob = \
+            tf.nn.ctc_beam_search_decoder(self.logits,
+                                          self.seq_len,
+                                          merge_repeated=False)
+        # self.decoded, self.log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len,merge_repeated=False)
+        # Positions without a decoded label are filled with -1.
+        self.dense_decoded = tf.sparse_tensor_to_dense(self.decoded[0], default_value=-1)
+
+    def _conv2d(self, x, name, filter_size, in_channels, out_channels, strides):
+        """Apply a SAME-padded square convolution followed by a bias add.
+
+        Args:
+            x: Input tensor passed to ``tf.nn.conv2d``.
+            name: Variable scope holding the kernel ``W`` and bias ``b``.
+            filter_size: Height and width of the kernel.
+            in_channels: Number of input channels of the kernel.
+            out_channels: Number of output channels of the kernel.
+            strides: Stride used for both spatial dimensions.
+
+        Returns:
+            The convolution output with the bias added.
+        """
+        with tf.variable_scope(name):
+            kernel = tf.get_variable(name='W',
+                                     shape=[filter_size, filter_size, in_channels, out_channels],
+                                     dtype=tf.float32,
+                                     initializer=tf.glorot_uniform_initializer())  # tf.glorot_normal_initializer
+
+            b = tf.get_variable(name='b',
+                                shape=[out_channels],
+                                dtype=tf.float32,
+                                initializer=tf.constant_initializer())
+
+            con2d_op = tf.nn.conv2d(x, kernel, [1, strides, strides, 1], padding='SAME')
+
+        return tf.nn.bias_add(con2d_op, b)
+
+    def _batch_norm(self, name, x):
+        """Batch normalization.
+
+        Runs in training mode only when ``self.mode == 'train'``; with
+        ``updates_collections=None`` the moving statistics are updated in
+        place rather than through a separate update collection.
+        """
+        with tf.variable_scope(name):
+            x_bn = \
+                tf.contrib.layers.batch_norm(
+                    inputs=x,
+                    decay=0.9,
+                    center=True,
+                    scale=True,
+                    epsilon=1e-5,
+                    updates_collections=None,
+                    is_training=self.mode == 'train',
+                    fused=True,
+                    data_format='NHWC',
+                    zero_debias_moving_mean=True,
+                    scope='BatchNorm'
+                )
+
+        return x_bn
+
+    def _leaky_relu(self, x, leakiness=0.0):
+        """Return ``x`` where it is >= 0 and ``leakiness * x`` elsewhere."""
+        return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')
+
+    def _max_pool(self, x, ksize, strides):
+        """Apply SAME-padded max pooling with a square window and stride."""
+        return tf.nn.max_pool(x,
+                              ksize=[1, ksize, ksize, 1],
+                              strides=[1, strides, strides, 1],
+                              padding='SAME',
+                              name='max_pool')
+
diff --git a/TensorFlow/built-in/nlp/CNN-CTC_ID0683_for_TensorFlow/main_rt.py b/TensorFlow/built-in/nlp/CNN-CTC_ID0683_for_TensorFlow/main_rt.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e81c1ac17c2e682cf7558eaa2a73d748f18db03
--- /dev/null
+++ b/TensorFlow/built-in/nlp/CNN-CTC_ID0683_for_TensorFlow/main_rt.py
@@ -0,0 +1,255 @@
+#
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+
+"""
+from npu_bridge.npu_init import *
+
+import datetime
+import logging
+import os
+import time
+
+import cv2
+import numpy as np
+import tensorflow as tf
+
+import cnn_lstm_otc_ocr_rt
+import utils
+import helper
+
+# Flag values are taken from the project's utils module.
+FLAGS = utils.FLAGS
+
+# Module logger, set to INFO level.
+logger = logging.getLogger('Traing for OCR using CNN+LSTM+CTC')
+logger.setLevel(logging.INFO)
+
+
+def train(train_dir=None, val_dir=None, mode='train'):
+    """Train the model, checkpointing and validating at flag-defined steps.
+
+    Args:
+        train_dir: Directory handed to ``utils.DataIterator`` for training
+            data.
+        val_dir: Directory handed to ``utils.DataIterator`` for validation
+            data.
+        mode: Mode string forwarded to ``LSTMOCR``.
+    """
+    # This file imports the module as `cnn_lstm_otc_ocr_rt`; the previous
+    # reference to `cnn_lstm_otc_ocr` raised NameError.
+    model = cnn_lstm_otc_ocr_rt.LSTMOCR(mode)
+    model.build_graph()
+
+    print('loading train data')
+    train_feeder = utils.DataIterator(data_dir=train_dir)
+    print('size: ', train_feeder.size)
+
+    print('loading validation data')
+    val_feeder = utils.DataIterator(data_dir=val_dir)
+    print('size: {}\n'.format(val_feeder.size))
+
+    num_train_samples = train_feeder.size  # 100000
+    num_batches_per_epoch = int(num_train_samples / FLAGS.batch_size)  # example: 100000/100
+
+    num_val_samples = val_feeder.size
+    num_batches_per_epoch_val = int(num_val_samples / FLAGS.batch_size)  # example: 10000/100
+    # The validation order is shuffled once and reused for every validation.
+    shuffle_idx_val = np.random.permutation(num_val_samples)
+
+    # ***** npu modify begin *****
+    global_config = tf.ConfigProto()
+    custom_op = global_config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
+    # Dynamic-shape options, currently disabled:
+    #custom_op.parameter_map["dynamic_input"].b = 1
+    #custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes("lazy_recompile")
+    custom_op.parameter_map["jit_compile"].b = False
+    global_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
+    global_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
+    # ***** npu modify end ******
+
+    with tf.Session(config=global_config) as sess:
+        sess.run(tf.global_variables_initializer())
+
+        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
+        train_writer = tf.summary.FileWriter(FLAGS.logs_dir + '/train', sess.graph)
+        if FLAGS.restore:
+            ckpt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
+            if ckpt:
+                # the global_step will be restored as well
+                saver.restore(sess, ckpt)
+                print('restore from checkpoint{0}'.format(ckpt))
+
+        print('=============================begin training=============================')
+        for cur_epoch in range(FLAGS.num_epochs):
+            shuffle_idx = np.random.permutation(num_train_samples)
+            train_cost = 0
+            start_time = time.time()
+            batch_time = time.time()
+
+            # the training part
+            for cur_batch in range(num_batches_per_epoch):
+                # batch_time is reset on every iteration, so the value printed
+                # here is the time since the previous batch started.
+                if (cur_batch + 1) % 100 == 0:
+                    print('batch', cur_batch, ': time', time.time() - batch_time)
+                batch_time = time.time()
+                indexs = [shuffle_idx[i % num_train_samples] for i in
+                          range(cur_batch * FLAGS.batch_size, (cur_batch + 1) * FLAGS.batch_size)]
+                batch_inputs, _, batch_labels = \
+                    train_feeder.input_index_generate_batch(indexs)
+                # batch_inputs,batch_seq_len,batch_labels=utils.gen_batch(FLAGS.batch_size)
+                feed = {model.inputs: batch_inputs,
+                        model.labels: batch_labels}
+
+                # if summary is needed
+                summary_str, batch_cost, step, _ = \
+                    sess.run([model.merged_summay, model.cost, model.global_step, model.train_op], feed)
+                # calculate the cost
+                train_cost += batch_cost * FLAGS.batch_size
+
+                train_writer.add_summary(summary_str, step)
+
+                # save the checkpoint
+                if step % FLAGS.save_steps == 1:
+                    # makedirs(exist_ok=True) also creates missing parent
+                    # directories and has no check-then-create race.
+                    os.makedirs(FLAGS.checkpoint_dir, exist_ok=True)
+                    # logger.info('save checkpoint at step {0}', format(step))
+                    saver.save(sess, os.path.join(FLAGS.checkpoint_dir, 'ocr-model'), global_step=step)
+
+                # train_err += the_err * FLAGS.batch_size
+                # do validation
+                if step % FLAGS.validation_steps == 0:
+                    acc_batch_total = 0
+                    lastbatch_err = 0
+                    lr = 0
+                    for j in range(num_batches_per_epoch_val):
+                        indexs_val = [shuffle_idx_val[i % num_val_samples] for i in
+                                      range(j * FLAGS.batch_size, (j + 1) * FLAGS.batch_size)]
+                        val_inputs, _, val_labels = \
+                            val_feeder.input_index_generate_batch(indexs_val)
+                        val_feed = {model.inputs: val_inputs,
+                                    model.labels: val_labels}
+
+                        dense_decoded, lastbatch_err, lr = \
+                            sess.run([model.dense_decoded, model.cost, model.lrn_rate],
+                                     val_feed)
+
+                        # print the decode result
+                        ori_labels = val_feeder.the_label(indexs_val)
+                        acc = utils.accuracy_calculation(ori_labels, dense_decoded,
+                                                         ignore_value=-1, isPrint=True)
+                        acc_batch_total += acc
+
+                    accuracy = (acc_batch_total * FLAGS.batch_size) / num_val_samples
+
+                    avg_train_cost = train_cost / ((cur_batch + 1) * FLAGS.batch_size)
+
+                    # train_err /= num_train_samples
+                    now = datetime.datetime.now()
+                    log = "{}/{} {}:{}:{} Epoch {}/{}, " \
+                          "accuracy = {:.3f},avg_train_cost = {:.3f}, " \
+                          "lastbatch_err = {:.3f}, time = {:.3f},lr={:.8f}"
+                    print(log.format(now.month, now.day, now.hour, now.minute, now.second,
+                                     cur_epoch + 1, FLAGS.num_epochs, accuracy, avg_train_cost,
+                                     lastbatch_err, time.time() - start_time, lr))
+
+
+def infer(img_path, mode='infer'):
+    """Decode the images under ``img_path`` and append the text to a file.
+
+    Images are processed in batches of ``FLAGS.batch_size``; images left over
+    after the last full batch are not processed. One decoded string per image
+    is appended to ``./result.txt``.
+
+    Args:
+        img_path: Location handed to ``helper.load_img_path``.
+        mode: Mode string forwarded to ``LSTMOCR``.
+    """
+    # imgList = load_img_path('/home/yang/Downloads/FILE/ml/imgs/image_contest_level_1_validate/')
+    imgList = helper.load_img_path(img_path)
+    print(imgList[:5])
+
+    # This file imports the module as `cnn_lstm_otc_ocr_rt`; the previous
+    # reference to `cnn_lstm_otc_ocr` raised NameError.
+    model = cnn_lstm_otc_ocr_rt.LSTMOCR(mode)
+    model.build_graph()
+
+    # Floor division: `/` yields a float on Python 3 and range() rejects it.
+    total_steps = len(imgList) // FLAGS.batch_size
+
+    config = tf.ConfigProto(allow_soft_placement=True)
+    with tf.Session(config=npu_config_proto(config_proto=config)) as sess:
+        sess.run(tf.global_variables_initializer())
+
+        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
+        ckpt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
+        if ckpt:
+            saver.restore(sess, ckpt)
+            print('restore from ckpt{}'.format(ckpt))
+        else:
+            # Inference continues with freshly initialized variables.
+            print('cannot restore')
+
+        decoded_expression = []
+        for curr_step in range(total_steps):
+            batch_paths = imgList[curr_step * FLAGS.batch_size: (curr_step + 1) * FLAGS.batch_size]
+
+            imgs_input = []
+            for img in batch_paths:
+                # Read as grayscale (flag 0) and scale to [0, 1].
+                im = cv2.imread(img, 0).astype(np.float32) / 255.
+                im = np.reshape(im, [FLAGS.image_height, FLAGS.image_width, FLAGS.image_channel])
+                imgs_input.append(im)
+
+            # The model derives its sequence lengths internally, so only the
+            # images are fed (the per-image seq_len array that used to be
+            # built here was never passed to the session).
+            imgs_input = np.asarray(imgs_input)
+
+            feed = {model.inputs: imgs_input}
+            dense_decoded_code = sess.run(model.dense_decoded, feed)
+
+            for item in dense_decoded_code:
+                # -1 is the fill value for positions without a decoded label.
+                expression = ''.join(utils.decode_maps[i] for i in item if i != -1)
+                decoded_expression.append(expression)
+
+        with open('./result.txt', 'a') as f:
+            for code in decoded_expression:
+                f.write(code + '\n')
+
+
+def main(_):
+    """Validate ``FLAGS.num_gpus`` and dispatch on ``FLAGS.mode``.
+
+    Args:
+        _: Positional argument supplied by ``tf.app.run``; unused.
+
+    Raises:
+        ValueError: If ``FLAGS.num_gpus`` is neither 0 nor 1.
+    """
+    # The flag is only validated: the device string that used to be derived
+    # from it was never used, and the graph is always built under '/cpu:0'.
+    if FLAGS.num_gpus not in (0, 1):
+        raise ValueError('Only support 0 or 1 gpu.')
+
+    with tf.device('/cpu:0'):
+        if FLAGS.mode == 'train':
+            train(FLAGS.train_dir, FLAGS.val_dir, FLAGS.mode)
+
+        elif FLAGS.mode == 'infer':
+            infer(FLAGS.infer_dir, FLAGS.mode)
+
+# Script entry point: set TensorFlow logging to INFO, then let tf.app.run()
+# parse the flags and call main().
+if __name__ == '__main__':
+    tf.logging.set_verbosity(tf.logging.INFO)
+    tf.app.run()
+
diff --git a/TensorFlow/built-in/nlp/CNN-CTC_ID0683_for_TensorFlow/test/train_RT2_performance_1p.sh b/TensorFlow/built-in/nlp/CNN-CTC_ID0683_for_TensorFlow/test/train_RT2_performance_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ed46648fd31f67864388d70a0077b2ac25a47d70
--- /dev/null
+++ b/TensorFlow/built-in/nlp/CNN-CTC_ID0683_for_TensorFlow/test/train_RT2_performance_1p.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+cur_path=`pwd`/../
+
+#基础参数，需要模型审视修改
+#Batch Size
+batch_size=128
+#网络名称，同目录名称
+Network="CNN-CTC_ID0683_for_TensorFlow"
+#Device数量，单卡默认为1
+RankSize=1
+#训练epoch，可选
+train_epochs=2
+#学习率
+learning_rate=0.0001
+
+#使能RT2.0
+export ENABLE_RUNTIME_V2=1
+
+if [[ $1 == --help || $1 == --h ]];then
+   echo "usage:./train_performance_1p.sh --data_path=./imgs"
+   exit 1
+fi
+
+for para in $*
+do
+   if [[ $para == --data_path* ]];then
+      data_path=`echo ${para#*=}`
+   fi
+done
+
+if [[ $data_path  == "" ]];then
+   echo "[Error] para \"data_path \" must be config"
+   exit 1
+fi
+##############执行训练##########
+wait
+cd $cur_path
+if [ -d $cur_path/test/output ];then
+   rm -rf $cur_path/test/output/*
+   mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID
+else
+   mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID
+fi
+wait
+
+start=$(date +%s)
+nohup python3 main_rt.py --train_dir=${data_path}/train/ \
+	--val_dir=${data_path}/val \
+	--image_height=60 \
+	--image_width=180 \
+	--image_channel=1 \
+	--out_channels=64 \
+	--num_hidden=128 \
+	--batch_size=$batch_size \
+	--logs_dir=./log \
+	--num_gpus=1 \
+    --initial_learning_rate=$learning_rate \
+	--num_epochs=${train_epochs} \
+	--mode=train > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+wait
+end=$(date +%s)
+e2e_time=$(( $end - $start ))
+
+#echo "Final Performance ms/step : $average_perf"
+echo "Final Training Duration sec : $e2e_time"
+
+TrainingTime=`grep "batch " $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log |awk 'END {print $5}'`
+FPS=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'/'${TrainingTime}'}'`
+#打印，不需要修改
+echo "Final Performance images/sec : $FPS"
+
+#输出训练精度,需要模型审视修改
+train_accuracy=`grep "accuracy" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $7}'|cut -d , -f 1`
+#打印，不需要修改
+echo "Final Train Accuracy : ${train_accuracy}"
+
+#性能看护结果汇总
+#训练用例信息，不需要修改
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'perf'
+
+##获取性能数据，不需要修改
+#吞吐量
+ActualFPS=${FPS}
+#单迭代训练时长
+#TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'`
+
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中，需要根据模型审视
+grep "lastbatch_err" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $12}' | cut -d , -f 1 > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#最后一个迭代loss值，不需要修改
+ActualLoss=`awk 'END {print $1}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#关键信息打印到${CaseName}.log中，不需要修改
+echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log