diff --git a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/base.py b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/base.py index c7ff3a36ae99ed24f495d6e80607403c3d84a19c..87bc7b8d35a9470fdcf172f15198e92af1a4b1aa 100644 --- a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/base.py +++ b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/base.py @@ -26,8 +26,9 @@ # See the License for the specific language governing permissions and # limitations under the License. - from npu_bridge.npu_init import * +from npu_bridge.hccl import hccl_ops + import tensorflow as tf import tensorflow.contrib.slim as slim import numpy as np @@ -45,6 +46,8 @@ from .timer import Timer from .logger import colorlogger from .utils import approx_equal +rank_size = int(os.getenv('RANK_SIZE')) + class ModelDesc(object): __metaclass__ = abc.ABCMeta def __init__(self): @@ -148,6 +151,19 @@ class Base(object): # initialize tensorflow tfconfig = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) tfconfig.gpu_options.allow_growth = True + + #############npu modify start############### + custom_op = tfconfig.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + custom_op.parameter_map["use_off_line"].b = True + # + if int(rank_size) > 1: + custom_op.parameter_map["hcom_parallel"].b = True + # + tfconfig.graph_options.rewrite_options.remapping = RewriterConfig.OFF # off remap + tfconfig.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + #############npu modify end############### + self.sess = tf.Session(config=npu_config_proto(config_proto=tfconfig)) # build_graph @@ -240,8 +256,13 @@ class Trainer(Base): data_load_thread.reset_state() dataiter = data_load_thread.get_data() + + if int(rank_size) > 1: + itr_per_epoch = math.ceil(len(train_data)/self.cfg.batch_size/self.cfg.num_gpus/rank_size) + else: + itr_per_epoch = 
math.ceil(len(train_data)/self.cfg.batch_size/self.cfg.num_gpus) - return dataiter, math.ceil(len(train_data)/self.cfg.batch_size/self.cfg.num_gpus) + return dataiter, itr_per_epoch def _make_graph(self): self.logger.info("Generating training graph on {} GPUs ...".format(self.cfg.num_gpus)) @@ -310,6 +331,12 @@ class Trainer(Base): self.sess.run(tf.variables_initializer(tf.global_variables(), name='init')) self.load_weights('last_epoch' if self.cfg.continue_train else self.cfg.init_model) + rank_size = int(os.getenv('RANK_SIZE', '1')) + if int(rank_size) > 1: + bcast_vars = tf.trainable_variables() + bcast_global_variables_op = hccl_ops.broadcast(bcast_vars, 0) + self.sess.run(bcast_global_variables_op) + self.logger.info('Start training ...') start_itr = self.cur_epoch * self.itr_per_epoch + 1 end_itr = self.itr_per_epoch * self.cfg.end_epoch + 1 diff --git a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/data_provider.py b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/data_provider.py index db62e5fd56a31ff7261f47dfacf42182602afd5c..bf21411e98ed57aaa199edb5d5a93a5e18988f3c 100644 --- a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/data_provider.py +++ b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/data_provider.py @@ -355,10 +355,22 @@ class BatchData(object): Yields: Batched data by stacking each component on an extra 0th dimension.
""" + # shard by rankid + rank_size = int(os.getenv('RANK_SIZE')) + if int(rank_size) > 1: + rank_id = int(os.getenv('RANK_ID')) + else : + rank_id = 0 + + i = 0 holder = [] for data in self.ds.get_data(): - holder.append(data) + # holder.append(data) + if (i // self.batch_size) == rank_id: + holder.append(data) + i += 1 if len(holder) == self.batch_size: + i = 0 yield BatchData._aggregate_batch(holder, self.use_list) del holder[:] diff --git a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/net_utils.py b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/net_utils.py index ea0df01e1cd41158ac6a1038fab6578aef67c4a8..918718cbb4a4b0970f5722c6ba40343e59fb4c82 100644 --- a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/net_utils.py +++ b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/net_utils.py @@ -129,6 +129,7 @@ def get_optimizer(lr, optimizer='momentum'): optimizer = tf.train.AdamOptimizer(lr) else: raise ValueError('invalid optimizer') + optimizer = npu_distributed_optimizer_wrapper(optimizer) return optimizer def get_tower_summary_dict(summary): diff --git a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/main/config.py b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/main/config.py index 22ab2590283bc8a1cfe64737bbb63ca828fb5bdb..51a29b347c13da890f59a6c84a2e1420a3bb977b 100644 --- a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/main/config.py +++ b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/main/config.py @@ -75,6 +75,7 @@ class Config: pixel_means = np.array([[[123.68, 116.78, 103.94]]]) ## training config + rank_size = int(os.getenv('RANK_SIZE')) lr_dec_epoch = [90, 120] end_epoch = 140 lr = 5e-4 diff --git a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/test/train_full_8p.sh new file 
mode 100644 index 0000000000000000000000000000000000000000..b3fcb2e67f473bd65f187ba71bb528686fbc1695 --- /dev/null +++ b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/test/train_full_8p.sh @@ -0,0 +1,128 @@ +#!/bin/bash + + +export RANK_SIZE=1 +export JOB_ID=10087 +export RANK_ID_START=0 +export RANK_TABLE_FILE=/hdu/zhengleilei/SimpleHumanPose_ID0956_for_TensorFlow_1P/configs/rank_table_8p.json + + + +cur_path=`pwd` +data_path="" +ckpt_path="" +Network="SimpleHumanPose_ID0956_for_TensorFlow" +#batch_size=32 +batch_size=256 +epoch=140 +# train_performance_1p.sh perf +# train_full_1p.sh acc +CaseName="${Network}_bs${batch_size}_${RANK_SIZE}p_acc" + + +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + echo "${data_path}" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + echo "${ckpt_path}" + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` + echo "${batch_size}" + elif [[ $para == --max_steps* ]];then + max_steps=`echo ${para#*=}` + echo "${max_steps}" + fi +done +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +cd $cur_path/../ +rm -rf ${data_path}/model_dump/COCO/* +# CHANGE PARM +# data_path +sed -i "72a\ \ \ \ data_path = \'$data_path\'" src/data/COCO/dataset.py +sed -i "53a\ \ \ \ data_path = \'$data_path\'" src/main/config.py +# end_epoch +# sed -i "s/end_epoch = 140/end_epoch = 1/g" 
src/main/config.py +# sed -i "s/test_epoch = 140/test_epoch = 1/g" src/main/test_my.py +# START +start_time=$(date +%s) +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID + DEVICE_INDEX=$RANK_ID + export DEVICE_INDEX=${DEVICE_INDEX} + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt + else + mkdir -p ${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt + fi + # train + nohup python3 -u src/main/train.py > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + wait + # eval + mkdir -p ${data_path}/model_dump/COCO/ + mv cache/result/model_dump/COCO/snapshot_${epoch}.ckpt* ${data_path}/model_dump/COCO/ + echo "mv cache/result/model_dump/COCO/snapshot_${epoch}.ckpt* ${data_path}/model_dump/COCO/" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log + nohup python3 -u src/main/test_my.py >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +end_time=$(date +%s) +e2e_time=$(( ${end_time} - ${start_time} )) +# sed -i "s/end_epoch = 1/end_epoch = 140/g" src/main/config.py +# sed -i "s/test_epoch = 1/test_epoch = 140/g" src/main/test_my.py +rm -rf ${data_path}/model_dump/COCO/* + + +echo "------------------ Final result ------------------" +BatchSize=${batch_size} +DeviceType=`uname -m` +# getFPS +Time=`grep loss ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'Epoch 0' | awk -F's/itr ' 'END{print $2}' | awk -F'h' '{print 60*60*$1}'` +FPS=`awk 'BEGIN{printf "%.4f\n",'${batch_size}'*4682/'${Time}'}'` +ActualFPS=${FPS} +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*4682/'${FPS}'}'` +# getAcc +train_accuracy=`grep 'IoU' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F'] = ' '{print $2}' | 
head -n 1` +# getLoss +grep 'loss' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep Epoch | awk -F'loss: ' '{print $2}' > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt +ActualLoss=`awk 'END {print}' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` +echo "Final Performance images/sec : ${FPS}" +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : ${e2e_time}" + + +echo "Network = ${Network}" > ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log diff --git a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/test/train_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..be83794d8b98518e93d1d57ba72d6ae1f4ed62cb --- /dev/null +++ b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/test/train_performance_8p.sh @@ -0,0 +1,122 @@ +#!/bin/bash + + +export RANK_SIZE=8 +export JOB_ID=10087 +export RANK_ID_START=0 +export 
RANK_TABLE_FILE=/hdu/zhengleilei/SimpleHumanPose_ID0956_for_TensorFlow_1P/configs/rank_table_8p.json + + + +cur_path=`pwd` +data_path="" +ckpt_path="" +Network="SimpleHumanPose_ID0956_for_TensorFlow" +#batch_size=32 +batch_size=256 +epoch=1 +# train_performance_1p.sh perf +# train_full_1p.sh acc +CaseName="${Network}_bs${batch_size}_${RANK_SIZE}p_perf" + + +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + echo "${data_path}" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + echo "${ckpt_path}" + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` + echo "${batch_size}" + elif [[ $para == --max_steps* ]];then + max_steps=`echo ${para#*=}` + echo "${max_steps}" + fi +done +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +cd $cur_path/../ +# CHANGE PARM +# data_path +sed -i "72a\ \ \ \ data_path = \'$data_path\'" src/data/COCO/dataset.py +sed -i "52a\ \ \ \ data_path = \'$data_path\'" src/main/config.py +# end_epoch +sed -i "s/end_epoch = 140/end_epoch = 1/g" src/main/config.py +# START +start_time=$(date +%s) +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID + DEVICE_INDEX=$RANK_ID + export DEVICE_INDEX=${DEVICE_INDEX} + if [ -d 
${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt + else + mkdir -p ${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt + fi + nohup python3 -u src/main/train.py > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + # wait + # eval + # cp -r cache/result/model_dump ${data_path}/ + # wait + # nohup python3 -u main/test_my.py >> > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +end_time=$(date +%s) +e2e_time=$(( ${end_time} - ${start_time} )) +sed -i "s/end_epoch = 1/end_epoch = 140/g" src/main/config.py + + +echo "------------------ Final result ------------------" +BatchSize=${batch_size} +DeviceType=`uname -m` +# getFPS +Time=`grep loss ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'Epoch 0' | awk -F's/itr ' 'END{print $2}' | awk -F'h' '{print 60*60*$1}'` +FPS=`awk 'BEGIN{printf "%.4f\n",'${batch_size}'*4682/'${Time}'}'` +ActualFPS=${FPS} +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*4682/'${FPS}'}'` +# getAcc +train_accuracy="None" +# getLoss +grep 'loss' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep Epoch | awk -F'loss: ' '{print $2}' > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt +ActualLoss=`awk 'END {print}' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` +echo "Final Performance images/sec : ${FPS}" +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : ${e2e_time}" + + +echo "Network = ${Network}" > ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "CaseName = ${CaseName}" >> 
${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log