diff --git a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/base.py b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/base.py index c7ff3a36ae99ed24f495d6e80607403c3d84a19c..87bc7b8d35a9470fdcf172f15198e92af1a4b1aa 100644 --- a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/base.py +++ b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/base.py @@ -26,8 +26,9 @@ # See the License for the specific language governing permissions and # limitations under the License. - from npu_bridge.npu_init import * +from npu_bridge.hccl import hccl_ops + import tensorflow as tf import tensorflow.contrib.slim as slim import numpy as np @@ -45,6 +46,8 @@ from .timer import Timer from .logger import colorlogger from .utils import approx_equal +rank_size = int(os.getenv('RANK_SIZE')) + class ModelDesc(object): __metaclass__ = abc.ABCMeta def __init__(self): @@ -148,6 +151,19 @@ class Base(object): # initialize tensorflow tfconfig = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) tfconfig.gpu_options.allow_growth = True + + #############npu modify start############### + custom_op = tfconfig.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + custom_op.parameter_map["use_off_line"].b = True + # + if int(rank_size) > 1: + custom_op.parameter_map["hcom_parallel"].b = True + # + tfconfig.graph_options.rewrite_options.remapping = RewriterConfig.OFF # off remap + tfconfig.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + #############npu modify end############### + self.sess = tf.Session(config=npu_config_proto(config_proto=tfconfig)) # build_graph @@ -240,8 +256,13 @@ class Trainer(Base): data_load_thread.reset_state() dataiter = data_load_thread.get_data() + + if int(rank_size) > 1: + itr_per_epoch = math.ceil(len(train_data)/self.cfg.batch_size/self.cfg.num_gpus/rank_size) + else: + itr_per_epoch = 
math.ceil(len(train_data)/self.cfg.batch_size/self.cfg.num_gpus) - return dataiter, math.ceil(len(train_data)/self.cfg.batch_size/self.cfg.num_gpus) + return dataiter, itr_per_epoch def _make_graph(self): self.logger.info("Generating training graph on {} GPUs ...".format(self.cfg.num_gpus)) @@ -310,6 +331,12 @@ class Trainer(Base): self.sess.run(tf.variables_initializer(tf.global_variables(), name='init')) self.load_weights('last_epoch' if self.cfg.continue_train else self.cfg.init_model) + rank_size = int(os.getenv('RANK_SIZE', '1')) + if int(rank_size) > 1: + bcast_vars = tf.trainable_variables() + bcast_global_variables_op = hccl_ops.broadcast(bcast_vars, 0) + self.sess.run(bcast_global_variables_op) + self.logger.info('Start training ...') start_itr = self.cur_epoch * self.itr_per_epoch + 1 end_itr = self.itr_per_epoch * self.cfg.end_epoch + 1 diff --git a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/data_provider.py b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/data_provider.py index db62e5fd56a31ff7261f47dfacf42182602afd5c..bf21411e98ed57aaa199edb5d5a93a5e18988f3c 100644 --- a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/data_provider.py +++ b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/data_provider.py @@ -355,10 +355,22 @@ class BatchData(object): Yields: Batched data by stacking each component on an extra 0th dimension.
""" + # shard by rankid + rank_size = int(os.getenv('RANK_SIZE')) + if int(rank_size) > 1: + rank_id = int(os.getenv('RANK_ID')) + else : + rank_id = 0 + + i = 0 holder = [] for data in self.ds.get_data(): - holder.append(data) + # holder.append(data) + if (i // self.batch_size) == rank_id: + holder.append(data) + i += 1 if len(holder) == self.batch_size: + i = 0 yield BatchData._aggregate_batch(holder, self.use_list) del holder[:] diff --git a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/net_utils.py b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/net_utils.py index ea0df01e1cd41158ac6a1038fab6578aef67c4a8..918718cbb4a4b0970f5722c6ba40343e59fb4c82 100644 --- a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/net_utils.py +++ b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/lib/tfflat/net_utils.py @@ -129,6 +129,7 @@ def get_optimizer(lr, optimizer='momentum'): optimizer = tf.train.AdamOptimizer(lr) else: raise ValueError('invalid optimizer') + optimizer = npu_distributed_optimizer_wrapper(optimizer) return optimizer def get_tower_summary_dict(summary): diff --git a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/main/config.py b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/main/config.py index 22ab2590283bc8a1cfe64737bbb63ca828fb5bdb..51a29b347c13da890f59a6c84a2e1420a3bb977b 100644 --- a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/main/config.py +++ b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/src/main/config.py @@ -75,6 +75,7 @@ class Config: pixel_means = np.array([[[123.68, 116.78, 103.94]]]) ## training config + rank_size = int(os.getenv('RANK_SIZE')) lr_dec_epoch = [90, 120] end_epoch = 140 lr = 5e-4 diff --git a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/test/train_full_8p.sh new file 
mode 100644 index 0000000000000000000000000000000000000000..b3fcb2e67f473bd65f187ba71bb528686fbc1695 --- /dev/null +++ b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/test/train_full_8p.sh @@ -0,0 +1,128 @@ +#!/bin/bash + + +export RANK_SIZE=1 +export JOB_ID=10087 +export RANK_ID_START=0 +export RANK_TABLE_FILE=/hdu/zhengleilei/SimpleHumanPose_ID0956_for_TensorFlow_1P/configs/rank_table_8p.json + + + +cur_path=`pwd` +data_path="" +ckpt_path="" +Network="SimpleHumanPose_ID0956_for_TensorFlow" +#batch_size=32 +batch_size=256 +epoch=140 +# train_performance_1p.sh perf +# train_full_1p.sh acc +CaseName="${Network}_bs${batch_size}_${RANK_SIZE}p_acc" + + +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + echo "${data_path}" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + echo "${ckpt_path}" + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` + echo "${batch_size}" + elif [[ $para == --max_steps* ]];then + max_steps=`echo ${para#*=}` + echo "${max_steps}" + fi +done +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +cd $cur_path/../ +rm -rf ${data_path}/model_dump/COCO/* +# CHANGE PARM +# data_path +sed -i "72a\ \ \ \ data_path = \'$data_path\'" src/data/COCO/dataset.py +sed -i "53a\ \ \ \ data_path = \'$data_path\'" src/main/config.py +# end_epoch +# sed -i "s/end_epoch = 140/end_epoch = 1/g" 
src/main/config.py +# sed -i "s/test_epoch = 140/test_epoch = 1/g" src/main/test_my.py +# START +start_time=$(date +%s) +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID + DEVICE_INDEX=$RANK_ID + export DEVICE_INDEX=${DEVICE_INDEX} + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt + else + mkdir -p ${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt + fi + # train + nohup python3 -u src/main/train.py > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + wait + # eval + mkdir -p ${data_path}/model_dump/COCO/ + mv cache/result/model_dump/COCO/snapshot_${epoch}.ckpt* ${data_path}/model_dump/COCO/ + echo "mv cache/result/model_dump/COCO/snapshot_${epoch}.ckpt* ${data_path}/model_dump/COCO/" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log + nohup python3 -u src/main/test_my.py >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +end_time=$(date +%s) +e2e_time=$(( ${end_time} - ${start_time} )) +# sed -i "s/end_epoch = 1/end_epoch = 140/g" src/main/config.py +# sed -i "s/test_epoch = 1/test_epoch = 140/g" src/main/test_my.py +rm -rf ${data_path}/model_dump/COCO/* + + +echo "------------------ Final result ------------------" +BatchSize=${batch_size} +DeviceType=`uname -m` +# getFPS +Time=`grep loss ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'Epoch 0' | awk -F's/itr ' 'END{print $2}' | awk -F'h' '{print 60*60*$1}'` +FPS=`awk 'BEGIN{printf "%.4f\n",'${batch_size}'*4682/'${Time}'}'` +ActualFPS=${FPS} +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*4682/'${FPS}'}'` +# getAcc +train_accuracy=`grep 'IoU' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F'] = ' '{print $2}' | 
head -n 1` +# getLoss +grep 'loss' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep Epoch | awk -F'loss: ' '{print $2}' > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt +ActualLoss=`awk 'END {print}' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` +echo "Final Performance images/sec : ${FPS}" +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : ${e2e_time}" + + +echo "Network = ${Network}" > ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log diff --git a/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/test/train_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..be83794d8b98518e93d1d57ba72d6ae1f4ed62cb --- /dev/null +++ b/TensorFlow/contrib/cv/SimpleHumanPose_ID0956_for_TensorFlow/test/train_performance_8p.sh @@ -0,0 +1,122 @@ +#!/bin/bash + + +export RANK_SIZE=8 +export JOB_ID=10087 +export RANK_ID_START=0 +export 
RANK_TABLE_FILE=/hdu/zhengleilei/SimpleHumanPose_ID0956_for_TensorFlow_1P/configs/rank_table_8p.json + + + +cur_path=`pwd` +data_path="" +ckpt_path="" +Network="SimpleHumanPose_ID0956_for_TensorFlow" +#batch_size=32 +batch_size=256 +epoch=1 +# train_performance_1p.sh perf +# train_full_1p.sh acc +CaseName="${Network}_bs${batch_size}_${RANK_SIZE}p_perf" + + +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + echo "${data_path}" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + echo "${ckpt_path}" + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` + echo "${batch_size}" + elif [[ $para == --max_steps* ]];then + max_steps=`echo ${para#*=}` + echo "${max_steps}" + fi +done +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +cd $cur_path/../ +# CHANGE PARM +# data_path +sed -i "72a\ \ \ \ data_path = \'$data_path\'" src/data/COCO/dataset.py +sed -i "52a\ \ \ \ data_path = \'$data_path\'" src/main/config.py +# end_epoch +sed -i "s/end_epoch = 140/end_epoch = 1/g" src/main/config.py +# START +start_time=$(date +%s) +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID + DEVICE_INDEX=$RANK_ID + export DEVICE_INDEX=${DEVICE_INDEX} + if [ -d 
${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt + else + mkdir -p ${cur_path}/output/${ASCEND_DEVICE_ID}/ckpt + fi + nohup python3 -u src/main/train.py > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + # wait + # eval + # cp -r cache/result/model_dump ${data_path}/ + # wait + # nohup python3 -u main/test_my.py >> > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +end_time=$(date +%s) +e2e_time=$(( ${end_time} - ${start_time} )) +sed -i "s/end_epoch = 1/end_epoch = 140/g" src/main/config.py + + +echo "------------------ Final result ------------------" +BatchSize=${batch_size} +DeviceType=`uname -m` +# getFPS +Time=`grep loss ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'Epoch 0' | awk -F's/itr ' 'END{print $2}' | awk -F'h' '{print 60*60*$1}'` +FPS=`awk 'BEGIN{printf "%.4f\n",'${batch_size}'*4682/'${Time}'}'` +ActualFPS=${FPS} +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*4682/'${FPS}'}'` +# getAcc +train_accuracy="None" +# getLoss +grep 'loss' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep Epoch | awk -F'loss: ' '{print $2}' > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt +ActualLoss=`awk 'END {print}' ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` +echo "Final Performance images/sec : ${FPS}" +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : ${e2e_time}" + + +echo "Network = ${Network}" > ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "CaseName = ${CaseName}" >> 
${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${cur_path}/output/${ASCEND_DEVICE_ID}/${CaseName}.log