diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/data_loader.py b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/data_loader.py
index 9494faa52a64a35667665871d1ac5f3eb4c08156..1b147b39602045c8c04962f5c0ac010824a54a17 100644
--- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/data_loader.py
+++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/data_loader.py
@@ -42,6 +42,7 @@ import tensorflow as tf
 
 from tf_smpl.batch_lbs import batch_rodrigues
 from util import data_utils
+import os
 
 _3D_DATASETS = ['h36m', 'up', 'mpi_inf_3dhp']
 
@@ -448,6 +449,9 @@ class DataLoader(object):
         files_no3d = data_utils.get_all_files(self.dataset_dir, datasets_no3d)
         files_yes3d = data_utils.get_all_files(self.dataset_dir, datasets_yes3d)
 
+        rank_size = int(os.getenv('RANK_SIZE', '1'))
+        rank_id = int(os.getenv('RANK_ID', '0'))
+
         if len(files_yes3d) == 0:
             print("Dont run this without any datasets with gt 3d")
             import ipdb; ipdb.set_trace()
@@ -460,6 +464,8 @@ class DataLoader(object):
             cycle_length=10,
             block_length=1,
             num_parallel_calls = tf.data.experimental.AUTOTUNE)
+        if rank_size > 1:
+            ds_yes3d = ds_yes3d.shard(rank_size, rank_id)
         options = tf.data.Options()
         options.experimental_threading.max_intra_op_parallelism = 1
         ds_yes3d = ds_yes3d.with_options(options)
@@ -480,6 +486,8 @@ class DataLoader(object):
             cycle_length=10,
             block_length=1,
             num_parallel_calls = tf.data.experimental.AUTOTUNE)
+        if rank_size > 1:
+            ds_no3d = ds_no3d.shard(rank_size, rank_id)
         options = tf.data.Options()
         options.experimental_threading.max_intra_op_parallelism = 1
         ds_no3d = ds_no3d.with_options(options)
@@ -559,6 +567,10 @@ class DataLoader(object):
             cycle_length=10,
             block_length=1,
             num_parallel_calls = tf.data.experimental.AUTOTUNE)
+        rank_size = int(os.getenv('RANK_SIZE', '1'))
+        rank_id = int(os.getenv('RANK_ID', '0'))
+        if rank_size > 1:
+            ds_smpl = ds_smpl.shard(rank_size, rank_id)
         options = tf.data.Options()
         options.experimental_threading.max_intra_op_parallelism = 1
         ds_smpl = ds_smpl.with_options(options)
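The data_loader.py hunks above read `RANK_SIZE`/`RANK_ID` from the environment and shard every `tf.data` pipeline by rank, so each of the eight devices trains on a disjoint slice of the TFRecords. A minimal, self-contained sketch of that pattern (the `'1'`/`'0'` fallbacks are defaults for runs where the launcher exports neither variable):

```python
import os
import tensorflow as tf

# Defaults of '1'/'0' cover single-device runs where the HCCL launcher
# has not exported RANK_SIZE/RANK_ID; the 8p script exports both.
rank_size = int(os.getenv('RANK_SIZE', '1'))
rank_id = int(os.getenv('RANK_ID', '0'))

ds = tf.data.Dataset.from_tensor_slices(list(range(16)))
if rank_size > 1:
    # Keep every rank_size-th element starting at rank_id, so the
    # shards are disjoint and together cover the whole dataset.
    ds = ds.shard(rank_size, rank_id)
```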
diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py
index 78238d79362289ed1ad8ca275ae03bfed103454a..bd05cf5d1299d55c12a5f18b3c0e4144e83a104b 100644
--- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py
+++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py
@@ -53,6 +53,7 @@ import numpy as np
 from os.path import join, dirname
 
 import deepdish as dd
+import os
 
 # For drawing
 from util import renderer as vis_util
@@ -95,9 +96,9 @@ class HMRTrainer(object):
         # Data
         num_images = num_examples(config.datasets)
         num_mocap = num_examples(config.mocap_datasets)
 
         self.num_itr_per_epoch = num_images / self.batch_size
-        self.num_itr_per_epoch_config = config.num_itr_per_epoch_config #add
+        self.num_itr_per_epoch_config = config.num_itr_per_epoch_config  # add
         self.num_mocap_itr_per_epoch = num_mocap / self.batch_size
 
         # First make sure data_format is right
@@ -316,7 +317,7 @@ class HMRTrainer(object):
         if not self.encoder_only:
             with tf.name_scope("gather_d_loss"):
                 self.d_loss = self.d_loss_weight * (
-                    self.d_loss_real + self.d_loss_fake)
+                        self.d_loss_real + self.d_loss_fake)
 
         # For visualizations, only save selected few into:
         # B x T x ...
@@ -337,6 +338,8 @@ class HMRTrainer(object):
         print('Setting up optimizer..')
         d_optimizer = self.optimizer(self.d_lr)
         e_optimizer = self.optimizer(self.e_lr)
+        d_optimizer = npu_distributed_optimizer_wrapper(d_optimizer)
+        e_optimizer = npu_distributed_optimizer_wrapper(e_optimizer)
 
         self.e_opt = e_optimizer.minimize(
             self.e_loss, global_step=self.global_step, var_list=self.E_var)
@@ -435,12 +438,12 @@ class HMRTrainer(object):
         # Compute losses:
         with tf.name_scope("comp_d_loss"):
             self.d_loss_real = tf.reduce_mean(
-                tf.reduce_sum((self.d_out_real - 1)**2, axis=1))
+                tf.reduce_sum((self.d_out_real - 1) ** 2, axis=1))
             self.d_loss_fake = tf.reduce_mean(
-                tf.reduce_sum((self.d_out_fake)**2, axis=1))
+                tf.reduce_sum((self.d_out_fake) ** 2, axis=1))
             # Encoder loss
             self.e_loss_disc = tf.reduce_mean(
-                tf.reduce_sum((self.d_out_fake - 1)**2, axis=1))
+                tf.reduce_sum((self.d_out_fake - 1) ** 2, axis=1))
 
     def get_3d_loss(self, Rs, shape, Js):
         """
@@ -484,7 +487,7 @@ class HMRTrainer(object):
         Renderer is an instance of SMPLRenderer.
         """
         gt_vis = gt_kp[:, 2].astype(bool)
-        loss = np.sum((gt_kp[gt_vis, :2] - pred_kp[gt_vis])**2)
+        loss = np.sum((gt_kp[gt_vis, :2] - pred_kp[gt_vis]) ** 2)
         debug_text = {"sc": cam[0], "tx": cam[1], "ty": cam[2], "kpl": loss}
         # Fix a flength so i can render this with persp correct scale
         f = 5.
@@ -556,20 +559,29 @@ class HMRTrainer(object):
             img_size=self.img_size,
             face_path=self.config.smpl_face_path)
+
         step = 0
-        perf_list=[]
-        fps_list=[]
+        perf_list = []
+        fps_list = []
         # define sess config
         sess_config = tf.ConfigProto()
         custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add()
         custom_op.name = "NpuOptimizer"
         custom_op.parameter_map["use_off_line"].b = True
-        custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") # mix precision
-        custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json")
+        custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")  # mix precision
+        custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("./ops_info.json")
+        custom_op.parameter_map["hcom_parallel"].b = True
         sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF # close remap
         sess_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
 
         with self.sv.managed_session(config=sess_config) as sess:
+
+            sess.graph._unsafe_unfinalize()  # un-finalize the graph
+            rank_size = int(os.environ.get('RANK_SIZE', '1'))
+            if rank_size > 1:
+                train_vars = tf.trainable_variables()
+                bcast_global_variables_op = hccl_ops.broadcast(train_vars, 0)
+                sess.run(bcast_global_variables_op)
             # Save graph.
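The trainer.py hunks wrap both optimizers with `npu_distributed_optimizer_wrapper`, which allreduces gradients across ranks before they are applied, and broadcast rank 0's initial weights so every device starts from identical parameters. A sketch of the same sequence in isolation; the `npu_bridge` import paths below are an assumption, since the diff uses these symbols without showing where trainer.py imports them:

```python
import tensorflow as tf
# Assumed import paths; the diff itself relies on imports made elsewhere.
from npu_bridge.estimator.npu.npu_optimizer import npu_distributed_optimizer_wrapper
from npu_bridge.hccl import hccl_ops

optimizer = tf.train.AdamOptimizer(1e-4)
# Gradients are allreduced across all ranks before being applied.
optimizer = npu_distributed_optimizer_wrapper(optimizer)

# Broadcast rank 0's variables so all ranks start from the same weights;
# run this op once after variable initialization, as the diff does.
bcast_op = hccl_ops.broadcast(tf.trainable_variables(), 0)
```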
            tf.io.write_graph(sess.graph, self.model_dir, 'graph.pbtxt', as_text=True)
 
             while not self.sv.should_stop():
@@ -607,61 +619,64 @@
                     })
                 if not self.encoder_only:
                     fetch_dict.update({
-                        "summary_occasional":
-                        self.summary_op_occ
+                        "summary_occasional": self.summary_op_occ
                     })
-
             t0 = time()
             result = sess.run(fetch_dict)
+            # compute metrics MPJPE and PA_MPJPE
             MPJPE, PA_MPJPE = compute_errors_w_mask(
                 result["gt_joints"] * 1000., result["pred_joints"] * 1000.,
                 result["has_gt3d_joints"])
             t1 = time()
-
+
             self.summary_writer.add_summary(
                 result['summary'], global_step=result['step'])
             e_loss = result['e_loss']
             step = result['step']
-            epoch = float(step) / (self.num_itr_per_epoch - self.num_itr_per_epoch_config) #add 4031
+
+            # epoch = float(step) / (self.num_itr_per_epoch - self.num_itr_per_epoch_config) #add 4031
+            epoch = float(step) / ((self.num_itr_per_epoch - self.num_itr_per_epoch_config) // rank_size)  # add 4031
             if self.encoder_only:
                 print("itr %d/(epoch %.1f): time %g, Enc_loss: %.4f, MPJPE: %.1f, PA_MPJPE: %.1f" %
                       (step, epoch, t1 - t0, e_loss, MPJPE, PA_MPJPE))
             else:
                 d_loss = result['d_loss']
                 if step > 2:
-                    perf = t1 - t0 #add
-                    fps = self.batch_size / perf #add
+                    perf = t1 - t0  # add
+                    # fps = self.batch_size / perf #add
+                    fps = rank_size * self.batch_size / perf  # add
                     perf_list.append(perf)
                     avg_perf = np.mean(perf_list)
                     fps_list.append(fps)
                     avg_fps = np.mean(fps_list)
-                    print(
-                        "itr %d/(epoch %.1f): time %g fps %.4f avg_perf %.4f avg_fps %.4f Enc_loss: %.4f Disc_loss: %.4f MPJPE: %.1f, PA_MPJPE: %.1f"
-                        % (step, epoch, perf, fps, avg_perf, avg_fps, e_loss, d_loss, MPJPE, PA_MPJPE)) #add
+                    print("itr %d/(epoch %.1f): time %g fps %.4f avg_perf %.4f avg_fps %.4f Enc_loss: %.4f Disc_loss: %.4f MPJPE: %.1f, PA_MPJPE: %.1f"
+                          % (step, epoch, perf, fps, avg_perf, avg_fps, e_loss, d_loss, MPJPE, PA_MPJPE))  # add
 
             if step % self.log_img_step == 0:
                 if not self.encoder_only:
                     self.summary_writer.add_summary(
                         result['summary_occasional'], global_step=result['step'])
-                #self.draw_results(result) #add
-
-            if step % 5000 == 0: #(5000)----------------------------------------
-                print("******************model_dir************",self.model_dir)
+                # self.draw_results(result) #add
+
+            if step % 5000 == 0:  # (5000)----------------------------------------
+                print("******************model_dir************", self.model_dir)
                 self.saver.save(
                     sess,
                     join(self.model_dir, 'model.ckpt'),
                     global_step=step
                 )
+
             self.summary_writer.flush()
 
             if epoch > self.max_epoch:
                 self.sv.request_stop()
             step += 1
-        print('Finish training on %s' % self.model_dir)
\ No newline at end of file
+
+        print('Finish training on %s' % self.model_dir)
diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/8p.json b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/8p.json
new file mode 100644
index 0000000000000000000000000000000000000000..761d7d9928cd60b16ca8602498af492a12a89ac4
--- /dev/null
+++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/8p.json
@@ -0,0 +1,15 @@
+{
+"server_count":"1",
+"server_list":[{
+    "device":[{"device_id":"0","device_ip":"192.168.1.199","rank_id":"0"},
+              {"device_id":"1","device_ip":"192.168.1.198","rank_id":"1"},
+              {"device_id":"2","device_ip":"192.168.1.197","rank_id":"2"},
+              {"device_id":"3","device_ip":"192.168.1.196","rank_id":"3"},
+              {"device_id":"4","device_ip":"192.168.1.195","rank_id":"4"},
+              {"device_id":"5","device_ip":"192.168.1.194","rank_id":"5"},
+              {"device_id":"6","device_ip":"192.168.1.193","rank_id":"6"},
+              {"device_id":"7","device_ip":"192.168.1.192","rank_id":"7"}],
+    "server_id":"127.0.0.1"}],
+"status":"completed",
+"version":"1.0"
+}
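8p.json is the rank table that the HCCL runtime loads through `RANK_TABLE_FILE`: a single server whose eight physical devices (`device_id`, `device_ip`) map to logical ranks 0-7. For illustration only, a sketch that walks the same structure:

```python
import json
import os

# RANK_TABLE_FILE is exported by test/train_full_8p.sh to point at 8p.json.
with open(os.environ['RANK_TABLE_FILE']) as f:
    table = json.load(f)

for server in table['server_list']:
    for dev in server['device']:
        print('rank %s -> device %s (%s)'
              % (dev['rank_id'], dev['device_id'], dev['device_ip']))
```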
{"device_id":"6","device_ip":"192.168.1.193","rank_id":"6"}, + {"device_id":"7","device_ip":"192.168.1.192","rank_id":"7"}], + "server_id":"127.0.0.1"}], +"status":"completed", +"version":"1.0" +} diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..ddee430ed017e6774e6e75b1d51956e9b2723011 --- /dev/null +++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh @@ -0,0 +1,197 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd`/../ + +#集合通信参数,不需要修改 + +export RANK_SIZE=8 +export RANK_TABLE_FILE=$cur_path/test/8p.json +export JOB_ID=10087 +RANK_ID_START=0 +ASCEND_DEVICE_ID_START=0 + +#export LD_PRELOAD=/usr/lib64/libglapi.so.0 +# 数据集路径,保持为空,不需要修改 +data_path='' +#预训练模型地址 +ckpt_path='' + +#设置默认日志级别,不需要改 +#export ASCEND_GLOBAL_LOG_LEVEL=3 +#export ASCEND_DEVICE_ID=4 + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="HMR_ID0783_for_TensorFlow" +#训练epoch +epochs=2 +#训练batch_size +batch_size=64 + + +#TF2.X独有,需要模型审视修改 +export NPU_LOOP_SIZE=${train_steps} + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + --ckpt_path model + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/test/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/test/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/test/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + fi +done +# #校验是否传入data_path,不需要修改 +# if [[$data_path == ""]];then +# echo "[Error] para \"data_path\" must be confing" +# exit 1 +# fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/src + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/test/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt + fi + +#--d_lr 0.0008 1e-4 +#--e_lr 0.00008 1e-5 + nohup python3.7.5 main.py \ + --d_lr 0.0008 \ + --e_lr 0.00008 \ + --log_img_step 100 \ + 
+        --pretrained_model_path=${ckpt_path}/resnet_v2_50.ckpt \
+        --smpl_model_path=${ckpt_path}/neutral_smpl_with_cocoplus_reg.pkl \
+        --smpl_face_path=${ckpt_path}/smpl_faces.npy \
+        --data_dir ${data_path} \
+        --e_loss_weight 60. \
+        --batch_size=64 \
+        --use_3d_label True \
+        --e_3d_weight 60. \
+        --datasets lsp,lsp_ext,mpii,coco,mpi_inf_3dhp \
+        --epoch 50 \
+        --log_dir ${cur_path}/logs/${ASCEND_DEVICE_ID} \
+        --num_itr_per_epoch_config 0 > ${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    # Set environment variables; do not modify
+    echo "Device ID: $RANK_ID"
+    export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    ASCEND_DEVICE_ID=$RANK_ID
+
+    nohup python3.7.5 eval.py \
+        --load_path=${cur_path}/logs/${ASCEND_DEVICE_ID}/model.ckpt-25000 \
+        --smpl_model_path=${ckpt_path}/neutral_smpl_with_cocoplus_reg.pkl \
+        --eval_data_dir=${data_path}/mpi_inf_3dhp/test > ${cur_path}/test/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+
+# Training end time; do not modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; do not modify
+echo "------------------ Final result ------------------"
+# Output performance FPS; review and modify per model
+TrainingTime=`grep 'avg_perf' $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $9}'`
+FPS=`grep 'avg_fps' $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $11}'`
+# Print; do not modify
+echo "Final Performance TrainingTime : $TrainingTime"
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and modify per model
+train_accuracy=`grep Metrics: $cur_path/test/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log|awk '{print $3}'`
+
+# Print; do not modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring summary
+# Training case info; do not modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data; do not modify
+# Throughput
+ActualFPS=${FPS}
+# Per-iteration training time
+#TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${FPS}'/69}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep 'Enc_loss:' $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $13}' >> $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+# Loss of the last iteration; do not modify
+ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key info into ${CaseName}.log; do not modify
+echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
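The final-result block above recovers `avg_perf`, `avg_fps`, and `Enc_loss` by awk column index (`$9`, `$11`, `$13`), which is tied to the exact field order of the print in trainer.py. A sketch of a keyword-based alternative that survives format drift; the log path in the usage comment is a stand-in:

```python
import re

def last_metric(log_path, key):
    """Return the last value printed after `key` in a training log."""
    value = None
    pattern = re.compile(re.escape(key) + r':?\s+([-+]?[\d.]+)')
    with open(log_path) as f:
        for line in f:
            m = pattern.search(line)
            if m:
                value = float(m.group(1))
    return value

# e.g. last_metric('train_0.log', 'avg_fps')  # stand-in path
```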
diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_performance_1p.sh
index f9bdff60f277b502785367d24ebfc96802215fdf..b58719f746cffdc90a8e8308fc7ff184ee2c1979 100644
--- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_performance_1p.sh
+++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_performance_1p.sh
@@ -92,6 +92,7 @@ start_time=$(date +%s)
 
 # Enter the training script directory; review and modify per model
 cd $cur_path/src
+export RANK_ID=$RANK_ID_START
 
 # Create the DeviceID output directory; do not modify
 if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ];then
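With the dataset sharded, each rank sees only 1/RANK_SIZE of the per-epoch iterations, which is why trainer.py now divides by `rank_size` when converting steps to epochs. A worked example with illustrative values (4031 echoes the `# add 4031` note in trainer.py; 25000 matches the `model.ckpt-25000` checkpoint evaluated by train_full_8p.sh):

```python
# Illustrative values, not measured ones.
num_itr_per_epoch = 4031         # num_images / batch_size on one device
num_itr_per_epoch_config = 0     # --num_itr_per_epoch_config in the scripts
rank_size = 8                    # RANK_SIZE in the 8p run

itr_per_rank = (num_itr_per_epoch - num_itr_per_epoch_config) // rank_size
step = 25000
epoch = float(step) / itr_per_rank
print(round(epoch, 1))  # ~49.7, consistent with --epoch 50
```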