From 4ecdd35649444e0d4590a82671a4724cb847f62d Mon Sep 17 00:00:00 2001
From: YUAN <1002548612@qq.com>
Date: Fri, 4 Nov 2022 02:10:10 +0000
Subject: [PATCH 1/5] =?UTF-8?q?=E6=B7=BB=E5=8A=A08p=E6=89=A7=E8=A1=8C?=
 =?UTF-8?q?=E8=84=9A=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YUAN <1002548612@qq.com>
---
 .../test/train_full_8p.sh                     | 183 ++++++++++++++++++
 1 file changed, 183 insertions(+)
 create mode 100644 TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/test/train_full_8p.sh

diff --git a/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/test/train_full_8p.sh
new file mode 100644
index 000000000..25a5de69d
--- /dev/null
+++ b/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/test/train_full_8p.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+
+# Current path, no need to modify
+cur_path=`pwd`/../
+
+# Collective communication parameters, no need to modify
+
+export RANK_SIZE=8
+export JOB_ID=10087
+RANK_ID_START=0
+export RANK_TABLE_FILE=${cur_path}/configs/rank_table_8p.json
+export HCCL_CONNECT_TIMEOUT=1800
+
+# Dataset path, no need to modify
+data_path='./dataset/Market/'
+ckpt_path=''
+
+# Default log level, no need to modify
+#export ASCEND_GLOBAL_LOG_LEVEL=3
+#export ASCEND_DEVICE_ID=3
+
+# Basic parameters, review and modify per model
+# Network name, same as the directory name
+Network="Unsupervised_Person_Re-identification_ID1028_for_TensorFlow"
+# Training epochs
+train_epochs=20
+# Training batch_size
+batch_size=16
+# Training steps
+train_steps=`expr 1281167 / ${batch_size}`
+# Learning rate
+learning_rate=0.495
+
+# TF2.X only, review and modify per model
+export NPU_LOOP_SIZE=${train_steps}
+
+# Debug parameters, precision_mode should be reviewed per model
+precision_mode="allow_mix_precision"
+# Maintenance parameters, no need to modify the following
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+# Help message, no need to modify
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage: ./train_full_8p.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode           precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump                whether to enable overflow detection, default is False
+    --data_dump_flag           data dump flag, default is False
+    --data_dump_step           data dump step, default is 10
+    --profiling                whether to enable profiling for performance debugging, default is False
+    --data_path                source data of training
+    -h/--help                  show help message
+    "
+    exit 1
+fi
+
+# Argument parsing, no need to modify
+for para in $*
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+        over_dump_path=${cur_path}/test/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+        data_dump_path=${cur_path}/test/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/test/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+
+    elif [[ $para == --ckpt_path* ]];then
+        ckpt_path=`echo ${para#*=}`
+    fi
+done
+
+# Check that data_path was passed in, no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+
+fi
+
+# Training start time, no need to modify
+start_time=$(date +%s)
+
+# Enter the training script directory, review and modify per model
+cd $cur_path/
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    # Set environment variables, no need to modify
+    echo "Device ID: $RANK_ID"
+    export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+
+
+
+    # Create the DeviceID output directory, no need to modify
+    if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt
+    else
+        mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt
+    fi
+
+    # Core binding; delete for models that do not need it, review and modify for models that do
+    corenum=`cat /proc/cpuinfo |grep "processor"|wc -l`
+    let a=RANK_ID*${corenum}/${RANK_SIZE}
+    let b=RANK_ID+1
+    let c=b*${corenum}/${RANK_SIZE}-1
+
+    bind_core="taskset -c $a-$c"
+
+    # Run the training script; the arguments below need no modification, review the rest per model
+    cp ${data_path}/checkpoint/0.ckpt ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt/
+    nohup ${bind_core} python3 train_on_npu.py \
+        --dataset=Market \
+        --NUM_EPOCH=20 \
+        --save_ckpt=${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt/ \
+        --data_path=${data_path}/ > ${cur_path}test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+done
+wait
+python3 evaluate_on_npu.py \
+        --dataset=Market \
+        --data_path=${data_path}/ \
+        --save_ckpt=${cur_path}/test/output/0/ckpt/ > ${cur_path}test/output/0/test_0.log 2>&1
+# Training end time, no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results, no need to modify
+echo "------------------ Final result ------------------"
+# Output performance FPS, review and modify per model
+TrainingTime=`grep "s/step" $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk 'END {print $5}' |awk -F'ms' '{print $1}'`
+
+
+
+# Output training accuracy, review and modify per model
+train_acc=`grep "mAP" $cur_path/test/output/0/test_0.log|awk '{print $10}'`
+# Print, no need to modify
+#echo "Final Train Accuracy : ${train_acc}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring summary
+# Training case information, no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+## Collect performance data, no need to modify
+# Throughput
+ActualFPS=`awk 'BEGIN{printf "%.2f\n", 16/'${TrainingTime}'}'`
+
+# Extract the loss values from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt, review per model
+grep 'loss:' $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}' > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the last iteration, no need to modify
+ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information into ${CaseName}.log, no need to modify
+echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_acc}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
+
-- 
Gitee

From 94de4a46ef20330bc632fed5fb25f41c4bf74cd9 Mon Sep 17 00:00:00 2001
From: YUAN <1002548612@qq.com>
Date: Fri, 4 Nov 2022 02:10:24 +0000
Subject: [PATCH 2/5] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20configs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../configs/.keep | 0
 1 file changed, 
0 insertions(+), 0 deletions(-) create mode 100644 TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/configs/.keep diff --git a/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/configs/.keep b/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/configs/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From b3cb7ccaaefd8bbdef16c607eb48f8e8d895b63d Mon Sep 17 00:00:00 2001 From: YUAN <1002548612@qq.com> Date: Fri, 4 Nov 2022 02:10:38 +0000 Subject: [PATCH 3/5] =?UTF-8?q?=E6=B7=BB=E5=8A=A08p=E7=A1=AC=E4=BB=B6?= =?UTF-8?q?=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YUAN <1002548612@qq.com> --- .../configs/rank_table_8p.json | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/configs/rank_table_8p.json diff --git a/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/configs/rank_table_8p.json b/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/configs/rank_table_8p.json new file mode 100644 index 000000000..cd9041f3e --- /dev/null +++ b/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/configs/rank_table_8p.json @@ -0,0 +1,52 @@ +{ + "server_count":"1", + "server_list":[ + { + "server_id":"10.147.179.27", + "device":[ + { + "device_id":"0", + "device_ip":"192.168.100.100", + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":"192.168.101.100", + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":"192.168.102.100", + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":"192.168.103.100", + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":"192.168.100.101", + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":"192.168.101.101", + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":"192.168.102.101", + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":"192.168.103.101", + "rank_id":"7" + } + ] + } + ], + "status":"completed", + "version":"1.0" +} \ No newline at end of file -- Gitee From 472e760966e238a987bcd6c5cf3ccb434560c77b Mon Sep 17 00:00:00 2001 From: YUAN <1002548612@qq.com> Date: Fri, 4 Nov 2022 02:10:53 +0000 Subject: [PATCH 4/5] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Tens?= =?UTF-8?q?orFlow/contrib/cv/Unsupervised=5FPerson=5FRe-identification=5FI?= =?UTF-8?q?D1028=5Ffor=5FTensorFlow/train=5Fon=5Fnpu.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../train_on_npu.py | 211 ------------------ 1 file changed, 211 deletions(-) delete mode 100644 TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/train_on_npu.py diff --git a/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/train_on_npu.py b/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/train_on_npu.py deleted file mode 100644 index b87f98614..000000000 --- a/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/train_on_npu.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division, print_function, absolute_import - -import os -import sys -import numpy as np -import argparse -import tensorflow as tf - -from keras.applications.resnet50 import ResNet50 -from keras.backend.tensorflow_backend import set_session -from keras.preprocessing.image import ImageDataGenerator -from keras.optimizers import SGD -from keras.preprocessing import image -from keras.applications.resnet50 import preprocess_input -from keras.utils.np_utils import to_categorical -from keras.optimizers import SGD -from keras.layers import Input -from keras.layers import Dense, Flatten, Dropout -from keras.initializers import RandomNormal -from keras.models import Model -from keras import backend as K -from keras.models import load_model -from sklearn.cluster import KMeans -from npu_bridge.npu_init import * -# from npu_bridge.estimator import npu_ops -from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig -import time - - -if __name__ == '__main__': - - code_dir = os.path.dirname(__file__) - work_dir = os.getcwd() - print("===>>>code_dir:{}, work_dir:{}".format(code_dir, work_dir)) - - # parser - parser = argparse.ArgumentParser(description='unsupervised training') - parser.add_argument('--dataset', type=str, required=True, default='', - help='Training dataset.') - parser.add_argument('--data_path', type=str, required=True, default='', - help='Directory contains required dataset.') - parser.add_argument('--END', type=int, default=25, help='ckpt range') - parser.add_argument('--end_step', type=int, default=0, help='contrl steps_per_epoch') - parser.add_argument('--NUM_EPOCH', type=int, default=20, help='train epochs') - parser.add_argument('--BATCH_SIZE', type=int, default=16, help='BATCH_SIZE') - parser.add_argument('--save_ckpt', type=str, default='./save_ckpt', help='save ckpt') - args = parser.parse_args() - - # dataset - if args.dataset.upper() == 'DUKE': - NUM_CLUSTER = 700 - else: - NUM_CLUSTER = 750 - - print(NUM_CLUSTER) - DATASET = args.data_path - save_ckpt=args.save_ckpt - LIST = os.path.join(DATASET, 'train.list') - TRAIN = os.path.join(DATASET, 'bounding_box_train') - - # learning - START = 1 - END = args.END - LAMBDA = 0.85 - NUM_EPOCH = args.NUM_EPOCH - BATCH_SIZE = args.BATCH_SIZE - - # session - sess_config = tf.ConfigProto() - custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add() - custom_op.name = "NpuOptimizer" - 
custom_op.parameter_map["use_off_line"].b = True - custom_op.parameter_map["dynamic_input"].b = True - custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes("lazy_recompile") - - custom_op.parameter_map["graph_memory_max_size"].s = tf.compat.as_bytes(str(21 * 1024 * 1024 * 1024)) - custom_op.parameter_map["variable_memory_max_size"].s = tf.compat.as_bytes(str(10 * 1024 * 1024 * 1024)) - - sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF - sess = tf.Session(config=sess_config) - K.set_session(sess) - # set_session(sess) - - # load data - unlabeled_images = [] - with open(LIST, 'r') as f: - print("--------openlist----------") - for line in f: - line = line.strip() - img, lbl = line.split() - img = image.load_img(os.path.join(TRAIN, img), target_size=[224, 224]) - img = image.img_to_array(img) - img = np.expand_dims(img, axis=0) - img = preprocess_input(img) - unlabeled_images.append(img) - - print("--------datagen----------") - - datagen = ImageDataGenerator(featurewise_center=False, - samplewise_center=False, - featurewise_std_normalization=False, - samplewise_std_normalization=False, - zca_whitening=False, - rotation_range=20, # 0. - width_shift_range=0.2, # 0. - height_shift_range=0.2, # 0. - shear_range=0., - zoom_range=0., - channel_shift_range=0., - fill_mode='nearest', - cval=0., - horizontal_flip=False, - vertical_flip=False, - rescale=None, - data_format=K.image_data_format()) - - # calculate the similarity matrix - center_t = tf.placeholder(tf.float32, (None, None)) - other_t = tf.placeholder(tf.float32, (None, None)) - center_t_norm = tf.nn.l2_normalize(center_t, dim=1) - other_t_norm = tf.nn.l2_normalize(other_t, dim=1) - similarity = tf.matmul(center_t_norm, other_t_norm, transpose_a=False, transpose_b=True) - - #checkpoint = os.path.join(DATASET, "checkpoint") - checkpoint1 = os.path.join(save_ckpt, "0.ckpt") - init_model = load_model(checkpoint1) - x1 = init_model.get_layer('avg_pool').output - x = x1 - x = Flatten(name='flatten')(x) - x = Dropout(0.5)(x) - x = Dense(NUM_CLUSTER, activation='softmax', name='new_fc8', - kernel_initializer=RandomNormal(mean=0.0, stddev=0.001))(x) - init_model = Model(input=init_model.input, output=x1) - net = Model(input=init_model.input, output=x) - fc8_weights = net.get_layer('new_fc8').get_weights() - for layer in net.layers: - layer.trainable = True - net.compile(optimizer=SGD(lr=0.001, momentum=0.9), loss='categorical_crossentropy') - - # iterate - for ckpt in range(START, END + 1): - print("--------iterate ckpt----------") - #checkpoint = os.path.join(DATASET, "checkpoint") - checkpoint1 = os.path.join(save_ckpt, '%d.ckpt' % (ckpt - 1)) - init_model.load_weights(checkpoint1, by_name=True) - - # extract features - features = [] - for img in unlabeled_images: - feature = init_model.predict(img) - features.append(np.squeeze(feature)) - features = np.array(features) - - # clustering - kmeans = KMeans(n_clusters=NUM_CLUSTER).fit(features) - - # select centers - distances = kmeans.transform(features) # num images * NUM_CLUSTER - center_idx = np.argmin(distances, axis=0) - centers = [features[i] for i in center_idx] - - # calculate similarity matrix - similarities = sess.run(similarity, {center_t: centers, other_t: features}) # NUM_CLUSTER * num images - - # calculate similarity matrixnet > LAMBDA)[:, 1]) - reliable_image_idx = np.unique(np.argwhere(similarities > LAMBDA)[:, 1]) - print('ckpt %d: # reliable images %d' % (ckpt, len(reliable_image_idx))) - sys.stdout.flush() - images = 
np.array([unlabeled_images[i][0] for i in reliable_image_idx]) - labels = to_categorical([kmeans.labels_[i] for i in reliable_image_idx]) - - # retrain: fine tune - #checkpoint = os.path.join(DATASET, "checkpoint") - checkpoint1 = os.path.join(save_ckpt, "0.ckpt") - net.load_weights(checkpoint1, by_name=True) - net.get_layer('new_fc8').set_weights(fc8_weights) - - net.fit_generator(datagen.flow(images, labels, batch_size=BATCH_SIZE), - steps_per_epoch=len(images) / BATCH_SIZE + 1 - args.end_step, - epochs=NUM_EPOCH) - net.save(os.path.join(save_ckpt, '%d.ckpt' % ckpt)) - # tf.io.write_graph(sess.graph, './checkpoint', 'graph.pbtxt', as_text=True) - sess.close() -- Gitee From 19b8787b2c69cddc8c82f269329a9d9da61c566a Mon Sep 17 00:00:00 2001 From: YUAN <1002548612@qq.com> Date: Fri, 4 Nov 2022 02:11:07 +0000 Subject: [PATCH 5/5] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=88=86=E5=B8=83?= =?UTF-8?q?=E5=BC=8F=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YUAN <1002548612@qq.com> --- .../train_on_npu.py | 246 ++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/train_on_npu.py diff --git a/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/train_on_npu.py b/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/train_on_npu.py new file mode 100644 index 000000000..e2054a234 --- /dev/null +++ b/TensorFlow/contrib/cv/Unsupervised_Person_Re-identification_ID1028_for_TensorFlow/train_on_npu.py @@ -0,0 +1,246 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import division, print_function, absolute_import
+from operator import le
+
+import os
+import sys
+import numpy as np
+import argparse
+import tensorflow as tf
+
+from keras.preprocessing.image import ImageDataGenerator
+from keras.optimizers import SGD
+from keras.preprocessing import image
+from keras.applications.resnet50 import preprocess_input
+from keras.utils.np_utils import to_categorical
+from keras.optimizers import SGD
+from keras.layers import Input
+from keras.layers import Dense, Flatten, Dropout
+from keras.initializers import RandomNormal
+from keras.models import Model
+from keras import backend as K
+from keras.models import load_model
+from sklearn.cluster import KMeans
+from npu_bridge.npu_init import *
+# from npu_bridge.estimator import npu_ops
+from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
+import time
+RANK_SIZE = int(os.environ['RANK_SIZE'])
+RANK_ID = int(os.environ['RANK_ID'])
+
+if __name__ == '__main__':
+
+    code_dir = os.path.dirname(__file__)
+    work_dir = os.getcwd()
+    print("===>>>code_dir:{}, work_dir:{}".format(code_dir, work_dir))
+
+    # parser
+    parser = argparse.ArgumentParser(description='unsupervised training')
+    parser.add_argument('--dataset', type=str, required=True, default='',
+                        help='Training dataset.')
+    parser.add_argument('--data_path', type=str, required=True, default='',
+                        help='Directory that contains the required dataset.')
+    parser.add_argument('--END', type=int, default=25, help='ckpt range')
+    parser.add_argument('--end_step', type=int, default=0,
+                        help='control steps_per_epoch')
+    parser.add_argument('--NUM_EPOCH', type=int,
+                        default=20, help='train epochs')
+    parser.add_argument('--BATCH_SIZE', type=int,
+                        default=16, help='BATCH_SIZE')
+    parser.add_argument('--save_ckpt', type=str,
+                        default='./save_ckpt', help='save ckpt')
+    args = parser.parse_args()
+
+    # dataset
+    if args.dataset.upper() == 'DUKE':
+        NUM_CLUSTER = 700
+    else:
+        NUM_CLUSTER = 750
+
+    print(NUM_CLUSTER)
+    DATASET = args.data_path
+    save_ckpt = args.save_ckpt
+    LIST = os.path.join(DATASET, 'train.list')
+    TRAIN = os.path.join(DATASET, 'bounding_box_train')
+
+    # learning
+    START = 1
+    END = args.END
+    LAMBDA = 0.85
+    NUM_EPOCH = args.NUM_EPOCH
+    BATCH_SIZE = args.BATCH_SIZE
+
+    # session
+    sess_config = tf.ConfigProto()
+    custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    custom_op.parameter_map["use_off_line"].b = True
+
+    custom_op.parameter_map["graph_memory_max_size"].s = tf.compat.as_bytes(
+        str(21 * 1024 * 1024 * 1024))
+    custom_op.parameter_map["variable_memory_max_size"].s = tf.compat.as_bytes(
+        str(10 * 1024 * 1024 * 1024))
+    # Run allreduce in parallel with computation
+    if RANK_SIZE > 1:
+        custom_op.parameter_map["hcom_parallel"].b = True
+
+    sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
+    sess = tf.Session(config=sess_config)
+    K.set_session(sess)
+    # set_session(sess)
+    # load data
+    unlabeled_images = []
+    with open(LIST, 'r') as f:
+        print("--------openlist----------")
+        for line in f:
+            line = line.strip()
+            img, lbl = line.split()
+            img = image.load_img(os.path.join(TRAIN, img),
+                                 target_size=[224, 224])
+            img = image.img_to_array(img)
+            img = np.expand_dims(img, axis=0)
+            img = preprocess_input(img)
+            unlabeled_images.append(img)
+
+    print("--------datagen----------")
+
+    datagen = ImageDataGenerator(featurewise_center=False,
+                                 samplewise_center=False,
+                                 featurewise_std_normalization=False,
+                                 samplewise_std_normalization=False,
+                                 zca_whitening=False,
+                                 rotation_range=20,  # 0.
+                                 width_shift_range=0.2,  # 0.
+                                 height_shift_range=0.2,  # 0.
+                                 shear_range=0.,
+                                 zoom_range=0.,
+                                 channel_shift_range=0.,
+                                 fill_mode='nearest',
+                                 cval=0.,
+                                 horizontal_flip=False,
+                                 vertical_flip=False,
+                                 rescale=None,
+                                 data_format=K.image_data_format())
+
+    # calculate the similarity matrix
+    center_t = tf.placeholder(tf.float32, (None, None))
+    other_t = tf.placeholder(tf.float32, (None, None))
+    center_t_norm = tf.nn.l2_normalize(center_t, dim=1)
+    other_t_norm = tf.nn.l2_normalize(other_t, dim=1)
+    similarity = tf.matmul(center_t_norm, other_t_norm,
+                           transpose_a=False, transpose_b=True)
+
+    #checkpoint = os.path.join(DATASET, "checkpoint")
+    checkpoint1 = os.path.join(save_ckpt, "0.ckpt")
+    init_model = load_model(checkpoint1)
+    x1 = init_model.get_layer('avg_pool').output
+    x = x1
+    x = Flatten(name='flatten')(x)
+    x = Dropout(0.5)(x)
+    x = Dense(NUM_CLUSTER, activation='softmax', name='new_fc8',
+              kernel_initializer=RandomNormal(mean=0.0, stddev=0.001))(x)
+    init_model = Model(input=init_model.input, output=x1)
+    net = Model(input=init_model.input, output=x)
+    fc8_weights = net.get_layer('new_fc8').get_weights()
+    for layer in net.layers:
+        layer.trainable = True
+    if RANK_SIZE > 1:
+        net.compile(optimizer=npu_distributed_optimizer_wrapper(SGD(lr=0.001, momentum=0.9)),
+                    loss='categorical_crossentropy')
+        callbacks = [NPUBroadcastGlobalVariablesCallback(0)]  # broadcast initial variables from rank 0
+    else:
+        net.compile(optimizer=SGD(lr=0.001, momentum=0.9),
+                    loss='categorical_crossentropy')
+
+    # Shard the unlabeled images across devices according to RANK_ID
+    if RANK_SIZE > 1:
+        num_part = len(unlabeled_images) // RANK_SIZE
+        unlabeled_images = unlabeled_images[RANK_ID * num_part:(RANK_ID + 1) * num_part]
+
+    # iterate
+    for ckpt in range(START, END + 1):
+        print("--------iterate ckpt----------")
+        #checkpoint = os.path.join(DATASET, "checkpoint")
+        checkpoint1 = os.path.join(save_ckpt, '%d.ckpt' % (ckpt - 1))
+        init_model.load_weights(checkpoint1, by_name=True)
+
+        # extract features
+        features = []
+        for img in unlabeled_images:
+            feature = init_model.predict(img)
+            features.append(np.squeeze(feature))
+        features = np.array(features)
+
+        # clustering
+        kmeans = KMeans(n_clusters=NUM_CLUSTER).fit(features)
+
+        # select centers
+        distances = kmeans.transform(features)  # num images * NUM_CLUSTER
+        center_idx = np.argmin(distances, axis=0)
+        centers = [features[i] for i in center_idx]
+
+        # calculate similarity matrix
+        # NUM_CLUSTER * num images
+        similarities = sess.run(
+            similarity, {center_t: centers, other_t: features})
+
+        # select reliable images whose similarity to a cluster center exceeds LAMBDA
+        reliable_image_idx = np.unique(
+            np.argwhere(similarities > LAMBDA)[:, 1])
+        print('ckpt %d: # reliable images %d' %
+              (ckpt, len(reliable_image_idx)))
+
+        images = np.array([unlabeled_images[i][0] for i in reliable_image_idx])
+        labels = to_categorical([kmeans.labels_[i]
+                                 for i in reliable_image_idx])
+
+        # When running multi-device, truncate images and labels to a fixed size
+        if RANK_SIZE > 1:
+            images = images[:1440]
+            labels = labels[:1440]
+
+        # retrain: fine tune
+        #checkpoint = os.path.join(DATASET, "checkpoint")
+        checkpoint1 = os.path.join(save_ckpt, "0.ckpt")
+        net.load_weights(checkpoint1, by_name=True)
+        net.get_layer('new_fc8').set_weights(fc8_weights)
+        if RANK_SIZE > 1:
+            net.fit_generator(datagen.flow(images, labels, batch_size=BATCH_SIZE),
+                              steps_per_epoch=len(
+                                  images) // BATCH_SIZE - args.end_step,
+                              epochs=NUM_EPOCH, callbacks=callbacks)
+        else:
+            net.fit_generator(datagen.flow(images, labels, batch_size=BATCH_SIZE),
+                              steps_per_epoch=len(
+                                  images) // BATCH_SIZE - args.end_step,
+                              epochs=NUM_EPOCH)
+        net.save(os.path.join(save_ckpt, '%d.ckpt' % ckpt))
+    sess.close()
+
-- 
Gitee
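
The core-binding loop in test/train_full_8p.sh pins each device's training process to a contiguous block of host CPUs. Below is a minimal Python sketch of that arithmetic, illustrative only and not part of the patch; the helper name core_range and the 96-core fallback are assumptions made for demonstration:

    import os

    def core_range(rank_id, rank_size, corenum):
        # Mirrors: let a=RANK_ID*corenum/RANK_SIZE ; let c=(RANK_ID+1)*corenum/RANK_SIZE-1
        a = rank_id * corenum // rank_size
        c = (rank_id + 1) * corenum // rank_size - 1
        return a, c

    if __name__ == "__main__":
        # The shell script counts processors from /proc/cpuinfo; fall back to 96 here.
        corenum = os.cpu_count() or 96
        for rank_id in range(8):
            a, c = core_range(rank_id, 8, corenum)
            print("device %d -> taskset -c %d-%d" % (rank_id, a, c))

With integer division, the eight ranges partition the host CPUs without overlap, matching the `taskset -c $a-$c` prefix the script builds for each device.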
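The distributed train_on_npu.py splits the unlabeled image list across devices before feature extraction and, after clustering, truncates each rank's reliable-image set to at most 1440 samples (presumably so every device runs the same number of steps). A minimal sketch of that sharding scheme, assuming nothing beyond the standard library; shard_for_rank and the synthetic image count are illustrative names, while the real script slices unlabeled_images in place using the RANK_SIZE and RANK_ID environment variables exported by train_full_8p.sh:

    import os

    def shard_for_rank(items, rank_id, rank_size):
        # Each rank takes an equal, contiguous slice; any tail remainder is dropped.
        num_part = len(items) // rank_size
        return items[rank_id * num_part:(rank_id + 1) * num_part]

    if __name__ == "__main__":
        rank_size = int(os.environ.get("RANK_SIZE", "8"))
        rank_id = int(os.environ.get("RANK_ID", "0"))

        # Stand-in list, roughly the size of the Market-1501 training set.
        all_images = list(range(12936))
        my_images = shard_for_rank(all_images, rank_id, rank_size)
        print("rank %d/%d owns %d images" % (rank_id, rank_size, len(my_images)))

        # The patch additionally caps the per-rank reliable-image set at 1440,
        # which keeps the fit_generator step count identical across devices.
        reliable = my_images[:1440]
        print("training on up to %d reliable images" % len(reliable))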