From 493b215932b3d48a158f480e18ab6bf1dc918eb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=AE=9D=E5=BC=9F?= Date: Tue, 14 Jun 2022 01:49:20 +0000 Subject: [PATCH 01/11] add TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/8p.json. --- .../cv/HMR_ID0783_for_TensorFlow/test/8p.json | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/8p.json diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/8p.json b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/8p.json new file mode 100644 index 000000000..761d7d992 --- /dev/null +++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/8p.json @@ -0,0 +1,15 @@ +{ +"server_count":"1", +"server_list":[{ + "device":[{"device_id":"0","device_ip":"192.168.1.199","rank_id":"0"}, + {"device_id":"1","device_ip":"192.168.1.198","rank_id":"1"}, + {"device_id":"2","device_ip":"192.168.1.197","rank_id":"2"}, + {"device_id":"3","device_ip":"192.168.1.196","rank_id":"3"}, + {"device_id":"4","device_ip":"192.168.1.195","rank_id":"4"}, + {"device_id":"5","device_ip":"192.168.1.194","rank_id":"5"}, + {"device_id":"6","device_ip":"192.168.1.193","rank_id":"6"}, + {"device_id":"7","device_ip":"192.168.1.192","rank_id":"7"}], + "server_id":"127.0.0.1"}], +"status":"completed", +"version":"1.0" +} -- Gitee From eaff46afb6c193f500a0eded705312d278383254 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=AE=9D=E5=BC=9F?= Date: Tue, 14 Jun 2022 01:50:00 +0000 Subject: [PATCH 02/11] add TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh. --- .../test/train_full_8p.sh | 197 ++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh new file mode 100644 index 000000000..f067fdd26 --- /dev/null +++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh @@ -0,0 +1,197 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd`/../ + +#集合通信参数,不需要修改 + +export RANK_SIZE=8 +export RANK_TABLE_FILE=$cur_path/test/8p.json +export JOB_ID=10087 +RANK_ID_START=0 +ASCEND_DEVICE_ID_START=0 + +#export LD_PRELOAD=/usr/lib64/libglapi.so.0 +# 数据集路径,保持为空,不需要修改 +data_path='' +#预训练模型地址 +ckpt_path='' + +#设置默认日志级别,不需要改 +#export ASCEND_GLOBAL_LOG_LEVEL=3 +#export ASCEND_DEVICE_ID=4 + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="HMR_ID0783_for_TensorFlow" +#训练epoch +epochs=2 +#训练batch_size +batch_size=64 + + +#TF2.X独有,需要模型审视修改 +export NPU_LOOP_SIZE=${train_steps} + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + --ckpt_path model + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/test/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/test/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/test/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + fi +done +# #校验是否传入data_path,不需要修改 +# if [[$data_path == ""]];then +# echo "[Error] para \"data_path\" must be confing" +# exit 1 +# fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/src + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/test/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt + fi + +#--d_lr 0.0008 1e-4 +#--e_lr 0.00008 1e-5 + nohup python3.7.5 main.py \ + --d_lr 1e-4 \ + --e_lr 1e-5 \ + --log_img_step 100 \ + --pretrained_model_path=${ckpt_path}/resnet_v2_50.ckpt \ + --smpl_model_path=${ckpt_path}/neutral_smpl_with_cocoplus_reg.pkl \ + --smpl_face_path=${ckpt_path}/smpl_faces.npy \ + --data_dir ${data_path} \ + --e_loss_weight 60. \ + --batch_size=64 \ + --use_3d_label True \ + --e_3d_weight 60. \ + --datasets lsp,lsp_ext,mpii,coco,mpi_inf_3dhp \ + --epoch 50 \ + --log_dir ${cur_path}/logs \ + --num_itr_per_epoch_config 0 > ${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID + + nohup python3.7.5 eval.py \ + --load_path=${cur_path}/logs/model.ckpt-25000 \ + --smpl_model_path=${ckpt_path}/neutral_smpl_with_cocoplus_reg.pkl \ + --eval_data_dir=${data_path}/mpi_inf_3dhp/test > ${cur_path}/test/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +TrainingTime=`grep 'avg_perf' $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $9}'` +FPS=`grep 'avg_fps' $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $11}'` +#打印,不需要修改 +echo "Final Performance TrainingTime : $TrainingTime" +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep Metrics: $cur_path/test/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log|awk '{print $3}'` + +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +#TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${FPS}'/69}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep 'Enc_loss:' $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $13}' >> $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需修改 +echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From 727033cc1cba1498addc0392a1ff9cfd7d501b0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=AE=9D=E5=BC=9F?= Date: Tue, 14 Jun 2022 01:52:59 +0000 Subject: [PATCH 03/11] update --- .../HMR_ID0783_for_TensorFlow/src/trainer.py | 67 ++++++++++++------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py index 78238d793..c05795181 100644 --- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py +++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py @@ -53,6 +53,7 @@ import numpy as np from os.path import join, dirname import deepdish as dd +import os # For drawing from util import renderer as vis_util @@ -95,9 +96,9 @@ class HMRTrainer(object): # Data num_images = num_examples(config.datasets) num_mocap = num_examples(config.mocap_datasets) - + print("num_images====", num_images) self.num_itr_per_epoch = num_images / self.batch_size - self.num_itr_per_epoch_config = config.num_itr_per_epoch_config #add + self.num_itr_per_epoch_config = config.num_itr_per_epoch_config # add self.num_mocap_itr_per_epoch = num_mocap / self.batch_size # First make sure data_format is right @@ -316,7 +317,7 @@ class HMRTrainer(object): if not self.encoder_only: with tf.name_scope("gather_d_loss"): self.d_loss = self.d_loss_weight * ( - self.d_loss_real + self.d_loss_fake) + self.d_loss_real + self.d_loss_fake) # For visualizations, only save selected few into: # B x T x ... @@ -337,6 +338,8 @@ class HMRTrainer(object): print('Setting up optimizer..') d_optimizer = self.optimizer(self.d_lr) e_optimizer = self.optimizer(self.e_lr) + d_optimizer = npu_distributed_optimizer_wrapper(d_optimizer) + e_optimizer = npu_distributed_optimizer_wrapper(e_optimizer) self.e_opt = e_optimizer.minimize( self.e_loss, global_step=self.global_step, var_list=self.E_var) @@ -435,12 +438,12 @@ class HMRTrainer(object): # Compute losses: with tf.name_scope("comp_d_loss"): self.d_loss_real = tf.reduce_mean( - tf.reduce_sum((self.d_out_real - 1)**2, axis=1)) + tf.reduce_sum((self.d_out_real - 1) ** 2, axis=1)) self.d_loss_fake = tf.reduce_mean( - tf.reduce_sum((self.d_out_fake)**2, axis=1)) + tf.reduce_sum((self.d_out_fake) ** 2, axis=1)) # Encoder loss self.e_loss_disc = tf.reduce_mean( - tf.reduce_sum((self.d_out_fake - 1)**2, axis=1)) + tf.reduce_sum((self.d_out_fake - 1) ** 2, axis=1)) def get_3d_loss(self, Rs, shape, Js): """ @@ -484,7 +487,7 @@ class HMRTrainer(object): Renderer is an instance of SMPLRenderer. """ gt_vis = gt_kp[:, 2].astype(bool) - loss = np.sum((gt_kp[gt_vis, :2] - pred_kp[gt_vis])**2) + loss = np.sum((gt_kp[gt_vis, :2] - pred_kp[gt_vis]) ** 2) debug_text = {"sc": cam[0], "tx": cam[1], "ty": cam[2], "kpl": loss} # Fix a flength so i can render this with persp correct scale f = 5. @@ -557,19 +560,27 @@ class HMRTrainer(object): face_path=self.config.smpl_face_path) step = 0 - perf_list=[] - fps_list=[] + perf_list = [] + fps_list = [] # define sess config sess_config = tf.ConfigProto() custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add() custom_op.name = "NpuOptimizer" custom_op.parameter_map["use_off_line"].b = True - custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") # mix precision - custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json") + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") # mix precision + custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("src/ops_info.json") + custom_op.parameter_map["hcom_parallel"].b = True sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF # close remap sess_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF with self.sv.managed_session(config=sess_config) as sess: + + sess.graph._unsafe_unfinalize() # 取消最终确定Graph + rank_size = int(os.environ.get('RANK_SIZE', '')) + if rank_size > 1: + input = tf.trainable_variables() + bcast_global_variables_op = hccl_ops.broadcast(input, 0) + sess.run(bcast_global_variables_op) # Save graph. tf.io.write_graph(sess.graph, self.model_dir, 'graph.pbtxt', as_text=True) while not self.sv.should_stop(): @@ -607,61 +618,65 @@ class HMRTrainer(object): }) if not self.encoder_only: fetch_dict.update({ - "summary_occasional": - self.summary_op_occ + "summary_occasional": self.summary_op_occ }) - t0 = time() result = sess.run(fetch_dict) + # compute metrics MPJPE and PA_MPJPE MPJPE, PA_MPJPE = compute_errors_w_mask( result["gt_joints"] * 1000., result["pred_joints"] * 1000., result["has_gt3d_joints"]) t1 = time() - + self.summary_writer.add_summary( result['summary'], global_step=result['step']) e_loss = result['e_loss'] step = result['step'] - epoch = float(step) / (self.num_itr_per_epoch - self.num_itr_per_epoch_config) #add 4031 + + # epoch = float(step) / (self.num_itr_per_epoch - self.num_itr_per_epoch_config) #add 4031 + epoch = float(step) / ((self.num_itr_per_epoch - self.num_itr_per_epoch_config) // rank_size) # add 4031 if self.encoder_only: print("itr %d/(epoch %.1f): time %g, Enc_loss: %.4f, MPJPE: %.1f, PA_MPJPE: %.1f" % (step, epoch, t1 - t0, e_loss, MPJPE, PA_MPJPE)) else: d_loss = result['d_loss'] if step > 2: - perf = t1 - t0 #add - fps = self.batch_size / perf #add + perf = t1 - t0 # add + # fps = self.batch_size / perf #add + fps = rank_size * self.batch_size / perf # add perf_list.append(perf) avg_perf = np.mean(perf_list) fps_list.append(fps) avg_fps = np.mean(fps_list) - print( - "itr %d/(epoch %.1f): time %g fps %.4f avg_perf %.4f avg_fps %.4f Enc_loss: %.4f Disc_loss: %.4f MPJPE: %.1f, PA_MPJPE: %.1f" - % (step, epoch, perf, fps, avg_perf, avg_fps, e_loss, d_loss, MPJPE, PA_MPJPE)) #add + print("itr %d/(epoch %.1f): time %g fps %.4f avg_perf %.4f avg_fps %.4f Enc_loss: %.4f Disc_loss: %.4f MPJPE: %.1f, PA_MPJPE: %.1f" + % (step, epoch, perf, fps, avg_perf, avg_fps, e_loss, d_loss, MPJPE, PA_MPJPE)) # add if step % self.log_img_step == 0: if not self.encoder_only: self.summary_writer.add_summary( result['summary_occasional'], global_step=result['step']) - #self.draw_results(result) #add - - if step % 5000 == 0: #(5000)---------------------------------------- - print("******************model_dir************",self.model_dir) + # self.draw_results(result) #add + + if step % 5000 == 0: # (5000)---------------------------------------- + print("******************model_dir************", self.model_dir) self.saver.save( sess, join(self.model_dir, 'model.ckpt'), global_step=step ) + # print("d_lr=== %.4f , e_lr== %.4f " % (self.d_lr, self.e_lr)) + self.summary_writer.flush() if epoch > self.max_epoch: self.sv.request_stop() step += 1 - print('Finish training on %s' % self.model_dir) \ No newline at end of file + + print('Finish training on %s' % self.model_dir) -- Gitee From ca5b11ee1790e9628547f378e564496943a7a5fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=AE=9D=E5=BC=9F?= Date: Tue, 14 Jun 2022 01:54:54 +0000 Subject: [PATCH 04/11] update --- .../cv/HMR_ID0783_for_TensorFlow/src/data_loader.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/data_loader.py b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/data_loader.py index 9494faa52..1b147b396 100644 --- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/data_loader.py +++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/data_loader.py @@ -32,6 +32,7 @@ Data loader with data augmentation. Only used for training. """ from __future__ import absolute_import +from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -42,6 +43,7 @@ import tensorflow as tf from tf_smpl.batch_lbs import batch_rodrigues from util import data_utils +import os _3D_DATASETS = ['h36m', 'up', 'mpi_inf_3dhp'] @@ -448,6 +450,9 @@ class DataLoader(object): files_no3d = data_utils.get_all_files(self.dataset_dir, datasets_no3d) files_yes3d = data_utils.get_all_files(self.dataset_dir, datasets_yes3d) + rank_size = int(os.getenv('RANK_SIZE')) + rank_id = int(os.getenv('RANK_ID')) + if len(files_yes3d) == 0: print("Dont run this without any datasets with gt 3d") import ipdb; ipdb.set_trace() @@ -460,6 +465,8 @@ class DataLoader(object): cycle_length=10, block_length=1, num_parallel_calls = tf.data.experimental.AUTOTUNE) + if rank_size > 1 : + ds_yes3d = ds_yes3d.shard(rank_size, rank_id) options = tf.data.Options() options.experimental_threading.max_intra_op_parallelism = 1 ds_yes3d = ds_yes3d.with_options(options) @@ -480,6 +487,8 @@ class DataLoader(object): cycle_length=10, block_length=1, num_parallel_calls = tf.data.experimental.AUTOTUNE) + if rank_size > 1: + ds_no3d = ds_no3d.shard(rank_size, rank_id) options = tf.data.Options() options.experimental_threading.max_intra_op_parallelism = 1 ds_no3d = ds_no3d.with_options(options) @@ -559,6 +568,10 @@ class DataLoader(object): cycle_length=10, block_length=1, num_parallel_calls = tf.data.experimental.AUTOTUNE) + rank_size = int(os.getenv('RANK_SIZE')) + rank_id = int(os.getenv('RANK_ID')) + if rank_size > 1 : + ds_smpl = ds_smpl.shard(rank_size, rank_id) options = tf.data.Options() options.experimental_threading.max_intra_op_parallelism = 1 ds_smpl = ds_smpl.with_options(options) -- Gitee From 30aa0a14156d6d37ed5a2d771148f28a56bc4c31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=AE=9D=E5=BC=9F?= Date: Tue, 14 Jun 2022 07:05:37 +0000 Subject: [PATCH 05/11] update --- .../cv/HMR_ID0783_for_TensorFlow/test/train_performance_1p.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_performance_1p.sh index f9bdff60f..b58719f74 100644 --- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_performance_1p.sh +++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_performance_1p.sh @@ -92,6 +92,7 @@ start_time=$(date +%s) #进入训练脚本目录,需要模型审视修改 cd $cur_path/src +export RANK_ID=$RANK_ID_START #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ];then -- Gitee From c41516ea042163fd45b7b729e21ec33151a49624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=AE=9D=E5=BC=9F?= Date: Tue, 14 Jun 2022 07:47:39 +0000 Subject: [PATCH 06/11] update --- TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py index c05795181..efc410578 100644 --- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py +++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py @@ -568,7 +568,7 @@ class HMRTrainer(object): custom_op.name = "NpuOptimizer" custom_op.parameter_map["use_off_line"].b = True custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") # mix precision - custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("src/ops_info.json") + custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("./src/ops_info.json") custom_op.parameter_map["hcom_parallel"].b = True sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF # close remap sess_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF -- Gitee From ccd8f24d41ba904cc617c3fddfc21118402cf279 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=AE=9D=E5=BC=9F?= Date: Tue, 14 Jun 2022 07:53:01 +0000 Subject: [PATCH 07/11] update --- TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py index efc410578..4f48d2068 100644 --- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py +++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py @@ -559,6 +559,8 @@ class HMRTrainer(object): img_size=self.img_size, face_path=self.config.smpl_face_path) + print("111111111111111111",os.path.dirname(os.path.abspath(__file__))) + step = 0 perf_list = [] fps_list = [] -- Gitee From bca692f4939e095600debaacbe7f3c822b9e47d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=AE=9D=E5=BC=9F?= Date: Tue, 14 Jun 2022 07:56:10 +0000 Subject: [PATCH 08/11] update --- TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py index 4f48d2068..3f14ae7e5 100644 --- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py +++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py @@ -570,7 +570,7 @@ class HMRTrainer(object): custom_op.name = "NpuOptimizer" custom_op.parameter_map["use_off_line"].b = True custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") # mix precision - custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("./src/ops_info.json") + custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("./ops_info.json") custom_op.parameter_map["hcom_parallel"].b = True sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF # close remap sess_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF -- Gitee From 4d90984966d48089f68f9066028eef0a46c5204f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=AE=9D=E5=BC=9F?= Date: Tue, 14 Jun 2022 08:35:37 +0000 Subject: [PATCH 09/11] update --- .../cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh index f067fdd26..41bc79d25 100644 --- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh @@ -114,8 +114,8 @@ do #--d_lr 0.0008 1e-4 #--e_lr 0.00008 1e-5 nohup python3.7.5 main.py \ - --d_lr 1e-4 \ - --e_lr 1e-5 \ + --d_lr 0.0008 \ + --e_lr 0.00008 \ --log_img_step 100 \ --pretrained_model_path=${ckpt_path}/resnet_v2_50.ckpt \ --smpl_model_path=${ckpt_path}/neutral_smpl_with_cocoplus_reg.pkl \ -- Gitee From b0e1f8cfb88ada2998f41392400fa187c670bbde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=AE=9D=E5=BC=9F?= Date: Tue, 14 Jun 2022 09:37:24 +0000 Subject: [PATCH 10/11] update --- .../cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh index 41bc79d25..ddee430ed 100644 --- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/test/train_full_8p.sh @@ -127,7 +127,7 @@ do --e_3d_weight 60. \ --datasets lsp,lsp_ext,mpii,coco,mpi_inf_3dhp \ --epoch 50 \ - --log_dir ${cur_path}/logs \ + --log_dir ${cur_path}/logs/${ASCEND_DEVICE_ID} \ --num_itr_per_epoch_config 0 > ${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait @@ -141,7 +141,7 @@ do ASCEND_DEVICE_ID=$RANK_ID nohup python3.7.5 eval.py \ - --load_path=${cur_path}/logs/model.ckpt-25000 \ + --load_path=${cur_path}/logs/${ASCEND_DEVICE_ID}/model.ckpt-25000 \ --smpl_model_path=${ckpt_path}/neutral_smpl_with_cocoplus_reg.pkl \ --eval_data_dir=${data_path}/mpi_inf_3dhp/test > ${cur_path}/test/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log 2>&1 & done -- Gitee From c90740f4a1a4ab7ddf476f74f34d2a8da5b1583b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=AE=9D=E5=BC=9F?= Date: Wed, 15 Jun 2022 00:58:41 +0000 Subject: [PATCH 11/11] update --- .../contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py index 3f14ae7e5..bd05cf5d1 100644 --- a/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py +++ b/TensorFlow/contrib/cv/HMR_ID0783_for_TensorFlow/src/trainer.py @@ -96,7 +96,7 @@ class HMRTrainer(object): # Data num_images = num_examples(config.datasets) num_mocap = num_examples(config.mocap_datasets) - print("num_images====", num_images) + # print("num_images====", num_images) self.num_itr_per_epoch = num_images / self.batch_size self.num_itr_per_epoch_config = config.num_itr_per_epoch_config # add self.num_mocap_itr_per_epoch = num_mocap / self.batch_size @@ -559,7 +559,7 @@ class HMRTrainer(object): img_size=self.img_size, face_path=self.config.smpl_face_path) - print("111111111111111111",os.path.dirname(os.path.abspath(__file__))) + # print("111111111111111111",os.path.dirname(os.path.abspath(__file__))) step = 0 perf_list = [] -- Gitee