diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/data_loader.py b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/data_loader.py
index 9494faa52a64a35667665871d1ac5f3eb4c08156..1b147b39602045c8c04962f5c0ac010824a54a17 100644
--- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/data_loader.py
+++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/data_loader.py
@@ -42,6 +42,7 @@ import tensorflow as tf
 
 from tf_smpl.batch_lbs import batch_rodrigues
 from util import data_utils
+import os
 
 _3D_DATASETS = ['h36m', 'up', 'mpi_inf_3dhp']
 
@@ -448,6 +449,9 @@ class DataLoader(object):
         files_no3d = data_utils.get_all_files(self.dataset_dir, datasets_no3d)
         files_yes3d = data_utils.get_all_files(self.dataset_dir, datasets_yes3d)
 
+        rank_size = int(os.getenv('RANK_SIZE', '1'))
+        rank_id = int(os.getenv('RANK_ID', '0'))
+
         if len(files_yes3d) == 0:
             print("Dont run this without any datasets with gt 3d")
             import ipdb; ipdb.set_trace()
@@ -460,6 +464,8 @@ class DataLoader(object):
             cycle_length=10,
             block_length=1,
             num_parallel_calls = tf.data.experimental.AUTOTUNE)
+        if rank_size > 1:
+            ds_yes3d = ds_yes3d.shard(rank_size, rank_id)
         options = tf.data.Options()
         options.experimental_threading.max_intra_op_parallelism = 1
         ds_yes3d = ds_yes3d.with_options(options)
@@ -480,6 +486,8 @@ class DataLoader(object):
             cycle_length=10,
             block_length=1,
             num_parallel_calls = tf.data.experimental.AUTOTUNE)
+        if rank_size > 1:
+            ds_no3d = ds_no3d.shard(rank_size, rank_id)
         options = tf.data.Options()
         options.experimental_threading.max_intra_op_parallelism = 1
         ds_no3d = ds_no3d.with_options(options)
@@ -559,6 +567,10 @@ class DataLoader(object):
             cycle_length=10,
             block_length=1,
             num_parallel_calls = tf.data.experimental.AUTOTUNE)
+        rank_size = int(os.getenv('RANK_SIZE', '1'))
+        rank_id = int(os.getenv('RANK_ID', '0'))
+        if rank_size > 1:
+            ds_smpl = ds_smpl.shard(rank_size, rank_id)
         options = tf.data.Options()
         options.experimental_threading.max_intra_op_parallelism = 1
         ds_smpl = ds_smpl.with_options(options)
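The data_loader.py hunks above read `RANK_SIZE`/`RANK_ID` from the environment and shard every `tf.data` pipeline by rank, so each of the eight devices trains on a disjoint slice of the TFRecords. A minimal, self-contained sketch of that pattern (the `'1'`/`'0'` fallbacks are defaults for runs where the launcher exports neither variable):

```python
import os
import tensorflow as tf

# Defaults of '1'/'0' cover single-device runs where the HCCL launcher
# has not exported RANK_SIZE/RANK_ID; the 8p script exports both.
rank_size = int(os.getenv('RANK_SIZE', '1'))
rank_id = int(os.getenv('RANK_ID', '0'))

ds = tf.data.Dataset.from_tensor_slices(list(range(16)))
if rank_size > 1:
    # Keep every rank_size-th element starting at rank_id, so the
    # shards are disjoint and together cover the whole dataset.
    ds = ds.shard(rank_size, rank_id)
```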
diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py
index 78238d79362289ed1ad8ca275ae03bfed103454a..bd05cf5d1299d55c12a5f18b3c0e4144e83a104b 100644
--- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py
+++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py
@@ -53,6 +53,7 @@ import numpy as np
 from os.path import join, dirname
 
 import deepdish as dd
+import os
 
 # For drawing
 from util import renderer as vis_util
@@ -95,9 +96,9 @@ class HMRTrainer(object):
         # Data
         num_images = num_examples(config.datasets)
         num_mocap = num_examples(config.mocap_datasets)
 
         self.num_itr_per_epoch = num_images / self.batch_size
-        self.num_itr_per_epoch_config = config.num_itr_per_epoch_config #add
+        self.num_itr_per_epoch_config = config.num_itr_per_epoch_config  # add
         self.num_mocap_itr_per_epoch = num_mocap / self.batch_size
 
         # First make sure data_format is right
@@ -316,7 +317,7 @@ class HMRTrainer(object):
         if not self.encoder_only:
             with tf.name_scope("gather_d_loss"):
                 self.d_loss = self.d_loss_weight * (
-                    self.d_loss_real + self.d_loss_fake)
+                        self.d_loss_real + self.d_loss_fake)
 
         # For visualizations, only save selected few into:
         # B x T x ...
@@ -337,6 +338,8 @@ class HMRTrainer(object):
         print('Setting up optimizer..')
         d_optimizer = self.optimizer(self.d_lr)
         e_optimizer = self.optimizer(self.e_lr)
+        d_optimizer = npu_distributed_optimizer_wrapper(d_optimizer)
+        e_optimizer = npu_distributed_optimizer_wrapper(e_optimizer)
 
         self.e_opt = e_optimizer.minimize(
             self.e_loss, global_step=self.global_step, var_list=self.E_var)
@@ -435,12 +438,12 @@ class HMRTrainer(object):
         # Compute losses:
         with tf.name_scope("comp_d_loss"):
             self.d_loss_real = tf.reduce_mean(
-                tf.reduce_sum((self.d_out_real - 1)**2, axis=1))
+                tf.reduce_sum((self.d_out_real - 1) ** 2, axis=1))
             self.d_loss_fake = tf.reduce_mean(
-                tf.reduce_sum((self.d_out_fake)**2, axis=1))
+                tf.reduce_sum((self.d_out_fake) ** 2, axis=1))
             # Encoder loss
             self.e_loss_disc = tf.reduce_mean(
-                tf.reduce_sum((self.d_out_fake - 1)**2, axis=1))
+                tf.reduce_sum((self.d_out_fake - 1) ** 2, axis=1))
 
     def get_3d_loss(self, Rs, shape, Js):
         """
@@ -484,7 +487,7 @@ class HMRTrainer(object):
         Renderer is an instance of SMPLRenderer.
         """
         gt_vis = gt_kp[:, 2].astype(bool)
-        loss = np.sum((gt_kp[gt_vis, :2] - pred_kp[gt_vis])**2)
+        loss = np.sum((gt_kp[gt_vis, :2] - pred_kp[gt_vis]) ** 2)
         debug_text = {"sc": cam[0], "tx": cam[1], "ty": cam[2], "kpl": loss}
         # Fix a flength so i can render this with persp correct scale
         f = 5.
@@ -556,20 +559,29 @@ class HMRTrainer(object):
             img_size=self.img_size,
             face_path=self.config.smpl_face_path)
+
         step = 0
-        perf_list=[]
-        fps_list=[]
+        perf_list = []
+        fps_list = []
         # define sess config
         sess_config = tf.ConfigProto()
         custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add()
         custom_op.name = "NpuOptimizer"
         custom_op.parameter_map["use_off_line"].b = True
-        custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") # mix precision
-        custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json")
+        custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")  # mix precision
+        custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("./ops_info.json")
+        custom_op.parameter_map["hcom_parallel"].b = True
         sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF # close remap
         sess_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
 
         with self.sv.managed_session(config=sess_config) as sess:
+
+            sess.graph._unsafe_unfinalize()  # un-finalize the graph
+            rank_size = int(os.environ.get('RANK_SIZE', '1'))
+            if rank_size > 1:
+                train_vars = tf.trainable_variables()
+                bcast_global_variables_op = hccl_ops.broadcast(train_vars, 0)
+                sess.run(bcast_global_variables_op)
             # Save graph.
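The trainer.py hunks wrap both optimizers with `npu_distributed_optimizer_wrapper`, which allreduces gradients across ranks before they are applied, and broadcast rank 0's initial weights so every device starts from identical parameters. A sketch of the same sequence in isolation; the `npu_bridge` import paths below are an assumption, since the diff uses these symbols without showing where trainer.py imports them:

```python
import tensorflow as tf
# Assumed import paths; the diff itself relies on imports made elsewhere.
from npu_bridge.estimator.npu.npu_optimizer import npu_distributed_optimizer_wrapper
from npu_bridge.hccl import hccl_ops

optimizer = tf.train.AdamOptimizer(1e-4)
# Gradients are allreduced across all ranks before being applied.
optimizer = npu_distributed_optimizer_wrapper(optimizer)

# Broadcast rank 0's variables so all ranks start from the same weights;
# run this op once after variable initialization, as the diff does.
bcast_op = hccl_ops.broadcast(tf.trainable_variables(), 0)
```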
            tf.io.write_graph(sess.graph, self.model_dir, 'graph.pbtxt', as_text=True)
 
             while not self.sv.should_stop():
@@ -607,61 +619,64 @@
                     })
                 if not self.encoder_only:
                     fetch_dict.update({
-                        "summary_occasional":
-                        self.summary_op_occ
+                        "summary_occasional": self.summary_op_occ
                     })
-
             t0 = time()
             result = sess.run(fetch_dict)
+            # compute metrics MPJPE and PA_MPJPE
             MPJPE, PA_MPJPE = compute_errors_w_mask(
                 result["gt_joints"] * 1000., result["pred_joints"] * 1000.,
                 result["has_gt3d_joints"])
             t1 = time()
-
+
             self.summary_writer.add_summary(
                 result['summary'], global_step=result['step'])
             e_loss = result['e_loss']
             step = result['step']
-            epoch = float(step) / (self.num_itr_per_epoch - self.num_itr_per_epoch_config) #add 4031
+
+            # epoch = float(step) / (self.num_itr_per_epoch - self.num_itr_per_epoch_config) #add 4031
+            epoch = float(step) / ((self.num_itr_per_epoch - self.num_itr_per_epoch_config) // rank_size)  # add 4031
             if self.encoder_only:
                 print("itr %d/(epoch %.1f): time %g, Enc_loss: %.4f, MPJPE: %.1f, PA_MPJPE: %.1f" %
                       (step, epoch, t1 - t0, e_loss, MPJPE, PA_MPJPE))
             else:
                 d_loss = result['d_loss']
                 if step > 2:
-                    perf = t1 - t0 #add
-                    fps = self.batch_size / perf #add
+                    perf = t1 - t0  # add
+                    # fps = self.batch_size / perf #add
+                    fps = rank_size * self.batch_size / perf  # add
                     perf_list.append(perf)
                     avg_perf = np.mean(perf_list)
                     fps_list.append(fps)
                     avg_fps = np.mean(fps_list)
-                    print(
-                        "itr %d/(epoch %.1f): time %g fps %.4f avg_perf %.4f avg_fps %.4f Enc_loss: %.4f Disc_loss: %.4f MPJPE: %.1f, PA_MPJPE: %.1f"
-                        % (step, epoch, perf, fps, avg_perf, avg_fps, e_loss, d_loss, MPJPE, PA_MPJPE)) #add
+                    print("itr %d/(epoch %.1f): time %g fps %.4f avg_perf %.4f avg_fps %.4f Enc_loss: %.4f Disc_loss: %.4f MPJPE: %.1f, PA_MPJPE: %.1f"
+                          % (step, epoch, perf, fps, avg_perf, avg_fps, e_loss, d_loss, MPJPE, PA_MPJPE))  # add
 
             if step % self.log_img_step == 0:
                 if not self.encoder_only:
                     self.summary_writer.add_summary(
                         result['summary_occasional'], global_step=result['step'])
-                #self.draw_results(result) #add
-
-            if step % 5000 == 0: #(5000)----------------------------------------
-                print("******************model_dir************",self.model_dir)
+                # self.draw_results(result) #add
+
+            if step % 5000 == 0:  # (5000)----------------------------------------
+                print("******************model_dir************", self.model_dir)
                 self.saver.save(
                     sess,
                     join(self.model_dir, 'model.ckpt'),
                     global_step=step
                 )
+
             self.summary_writer.flush()
 
             if epoch > self.max_epoch:
                 self.sv.request_stop()
             step += 1
-        print('Finish training on %s' % self.model_dir)
\ No newline at end of file
+
+        print('Finish training on %s' % self.model_dir)
diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/8p.json b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/8p.json
new file mode 100644
index 0000000000000000000000000000000000000000..761d7d9928cd60b16ca8602498af492a12a89ac4
--- /dev/null
+++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/8p.json
@@ -0,0 +1,15 @@
+{
+"server_count":"1",
+"server_list":[{
+    "device":[{"device_id":"0","device_ip":"192.168.1.199","rank_id":"0"},
+              {"device_id":"1","device_ip":"192.168.1.198","rank_id":"1"},
+              {"device_id":"2","device_ip":"192.168.1.197","rank_id":"2"},
+              {"device_id":"3","device_ip":"192.168.1.196","rank_id":"3"},
+              {"device_id":"4","device_ip":"192.168.1.195","rank_id":"4"},
+              {"device_id":"5","device_ip":"192.168.1.194","rank_id":"5"},
+              {"device_id":"6","device_ip":"192.168.1.193","rank_id":"6"},
+              {"device_id":"7","device_ip":"192.168.1.192","rank_id":"7"}],
+    "server_id":"127.0.0.1"}],
+"status":"completed",
+"version":"1.0"
+}
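8p.json is the rank table that the HCCL runtime loads through `RANK_TABLE_FILE`: a single server whose eight physical devices (`device_id`, `device_ip`) map to logical ranks 0-7. For illustration only, a sketch that walks the same structure:

```python
import json
import os

# RANK_TABLE_FILE is exported by test/train_full_8p.sh to point at 8p.json.
with open(os.environ['RANK_TABLE_FILE']) as f:
    table = json.load(f)

for server in table['server_list']:
    for dev in server['device']:
        print('rank %s -> device %s (%s)'
              % (dev['rank_id'], dev['device_id'], dev['device_ip']))
```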
{"device_id":"6","device_ip":"192.168.1.193","rank_id":"6"}, + {"device_id":"7","device_ip":"192.168.1.192","rank_id":"7"}], + "server_id":"127.0.0.1"}], +"status":"completed", +"version":"1.0" +} diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..ddee430ed017e6774e6e75b1d51956e9b2723011 --- /dev/null +++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh @@ -0,0 +1,197 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd`/../ + +#集合通信参数,不需要修改 + +export RANK_SIZE=8 +export RANK_TABLE_FILE=$cur_path/test/8p.json +export JOB_ID=10087 +RANK_ID_START=0 +ASCEND_DEVICE_ID_START=0 + +#export LD_PRELOAD=/usr/lib64/libglapi.so.0 +# 数据集路径,保持为空,不需要修改 +data_path='' +#预训练模型地址 +ckpt_path='' + +#设置默认日志级别,不需要改 +#export ASCEND_GLOBAL_LOG_LEVEL=3 +#export ASCEND_DEVICE_ID=4 + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="HMR_ID0783_for_TensorFlow" +#训练epoch +epochs=2 +#训练batch_size +batch_size=64 + + +#TF2.X独有,需要模型审视修改 +export NPU_LOOP_SIZE=${train_steps} + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + --ckpt_path model + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/test/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/test/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/test/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + fi +done +# #校验是否传入data_path,不需要修改 +# if [[$data_path == ""]];then +# echo "[Error] para \"data_path\" must be confing" +# exit 1 +# fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/src + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/test/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt + fi + +#--d_lr 0.0008 1e-4 +#--e_lr 0.00008 1e-5 + nohup python3.7.5 main.py \ + --d_lr 0.0008 \ + --e_lr 0.00008 \ + --log_img_step 100 \ + 
+        --pretrained_model_path=${ckpt_path}/resnet_v2_50.ckpt \
+        --smpl_model_path=${ckpt_path}/neutral_smpl_with_cocoplus_reg.pkl \
+        --smpl_face_path=${ckpt_path}/smpl_faces.npy \
+        --data_dir ${data_path} \
+        --e_loss_weight 60. \
+        --batch_size=64 \
+        --use_3d_label True \
+        --e_3d_weight 60. \
+        --datasets lsp,lsp_ext,mpii,coco,mpi_inf_3dhp \
+        --epoch 50 \
+        --log_dir ${cur_path}/logs/${ASCEND_DEVICE_ID} \
+        --num_itr_per_epoch_config 0 > ${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    # Set environment variables; do not modify
+    echo "Device ID: $RANK_ID"
+    export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    ASCEND_DEVICE_ID=$RANK_ID
+
+    nohup python3.7.5 eval.py \
+        --load_path=${cur_path}/logs/${ASCEND_DEVICE_ID}/model.ckpt-25000 \
+        --smpl_model_path=${ckpt_path}/neutral_smpl_with_cocoplus_reg.pkl \
+        --eval_data_dir=${data_path}/mpi_inf_3dhp/test > ${cur_path}/test/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+
+# Training end time; do not modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; do not modify
+echo "------------------ Final result ------------------"
+# Output performance FPS; review and modify per model
+TrainingTime=`grep 'avg_perf' $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $9}'`
+FPS=`grep 'avg_fps' $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $11}'`
+# Print; do not modify
+echo "Final Performance TrainingTime : $TrainingTime"
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and modify per model
+train_accuracy=`grep Metrics: $cur_path/test/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log|awk '{print $3}'`
+
+# Print; do not modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring summary
+# Training case info; do not modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data; do not modify
+# Throughput
+ActualFPS=${FPS}
+# Per-iteration training time
+#TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${FPS}'/69}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep 'Enc_loss:' $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $13}' >> $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+# Loss of the last iteration; do not modify
+ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key info into ${CaseName}.log; do not modify
+echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
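The final-result block above recovers `avg_perf`, `avg_fps`, and `Enc_loss` by awk column index (`$9`, `$11`, `$13`), which is tied to the exact field order of the print in trainer.py. A sketch of a keyword-based alternative that survives format drift; the log path in the usage comment is a stand-in:

```python
import re

def last_metric(log_path, key):
    """Return the last value printed after `key` in a training log."""
    value = None
    pattern = re.compile(re.escape(key) + r':?\s+([-+]?[\d.]+)')
    with open(log_path) as f:
        for line in f:
            m = pattern.search(line)
            if m:
                value = float(m.group(1))
    return value

# e.g. last_metric('train_0.log', 'avg_fps')  # stand-in path
```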
diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_performance_1p.sh
index f9bdff60f277b502785367d24ebfc96802215fdf..b58719f746cffdc90a8e8308fc7ff184ee2c1979 100644
--- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_performance_1p.sh
+++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_performance_1p.sh
@@ -92,6 +92,7 @@ start_time=$(date +%s)
 
 # Enter the training script directory; review and modify per model
 cd $cur_path/src
+export RANK_ID=$RANK_ID_START
 
 # Create the DeviceID output directory; do not modify
 if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ];then
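With the dataset sharded, each rank sees only 1/RANK_SIZE of the per-epoch iterations, which is why trainer.py now divides by `rank_size` when converting steps to epochs. A worked example with illustrative values (4031 echoes the `# add 4031` note in trainer.py; 25000 matches the `model.ckpt-25000` checkpoint evaluated by train_full_8p.sh):

```python
# Illustrative values, not measured ones.
num_itr_per_epoch = 4031         # num_images / batch_size on one device
num_itr_per_epoch_config = 0     # --num_itr_per_epoch_config in the scripts
rank_size = 8                    # RANK_SIZE in the 8p run

itr_per_rank = (num_itr_per_epoch - num_itr_per_epoch_config) // rank_size
step = 25000
epoch = float(step) / itr_per_rank
print(round(epoch, 1))  # ~49.7, consistent with --epoch 50
```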