diff --git a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/README.md b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/README.md index 949033f71ad3429146a5fe25234073526c41bc9a..b4d1b16bbba2c3d7f80874bd3d8dddd8d02d2708 100644 --- a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/README.md +++ b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/README.md @@ -1,45 +1,54 @@ -# 概述 - mean-teacher是一种用于图像分类的半监督学习方法,能够在拥有少量有标签数据的情况下训练出分类准确率很高的网络模型。 - -- 论文链接: [Weight-averaged consistency targets improve semi-supervised deep learning results](https://arxiv.org/abs/1703.01780) - -- 官方代码仓: [链接](https://github.com/CuriousAI/mean-teacher/) - -- 精度性能比较: - -| | 论文 | GPU | Ascend | -| ------ | ------ | ------ | ------ | -| error | 12.31% | 13.50% | 14.20% | -| 性能(s/steps) | | | | -# 环境 - - python 3.7.5 - - Tensorflow 1.15 - - Ascend910 - -# 训练 -## 数据集 - 使用./prepare_data.sh脚本预处理数据集 -##训练超参见train.py参数列表 -## 单卡训练命令 -```commandline -sh ./test/train_full_1p.sh -``` - -# 功能测试 -少量step(单epoch)运行 -```commandline -sh ./test/train_performance_1p.sh -``` - -# 模型固化 - -# 部分脚本和示例代码 -```text -├── README.md //说明文档 -├── requirements.txt //依赖 -├──test //训练脚本目录 -│ ├──train_performance_1p.sh -│ ├──train_full_1p.sh -├──train_cifar10.py //训练脚本 -``` - +# 概述 + mean-teacher是一种用于图像分类的半监督学习方法,能够在拥有少量有标签数据的情况下训练出分类准确率很高的网络模型。 + +- 论文链接: [Weight-averaged consistency targets improve semi-supervised deep learning results](https://arxiv.org/abs/1703.01780) + +- 官方代码仓: [链接](https://github.com/CuriousAI/mean-teacher/) + +- 精度性能比较: + +| | 论文 | GPU | Ascend | +| ------ | ------ | -- | ------ | +| error | 12.3% | 13.5% | 14.6% | +| 性能(s/steps) | | 1.17 | 0.30 | +# 环境 + - python 3.7.5 + - Tensorflow 1.15 + - Ascend910 + +# 训练 +## 数据集 + 使用./prepare_data.sh脚本预处理数据集 +## 训练超参见train_cifar10.py参数列表 +## 单卡训练命令 +首先在脚本test/train_full_1p.sh中,配置train_steps、data_path等参数,请用户根据实际路径配置data_path,或者在启动训练的命令行中以参数形式下发 + +-启动训练 +```commandline +bash train_full_1p.sh --data_path=../data +``` + +# 功能测试 
+少量step运行 +```commandline +bash ./test/train_performance_1p.sh +``` + +# 模型固化 +准备checkpoint,默认为 ./ckpt/checkpoint-40000 +- 执行脚本,结果将保存在 ./pb_model/mean-teacher.pb +```commandline +python3 freeze_graph.py +``` +# 部分脚本和示例代码 +```text +├── README.md //说明文档 +├── requirements.txt //依赖 +├──test //训练脚本目录 +│ ├──train_performance_1p.sh +│ ├──train_full_1p.sh +├──train_cifar10.py //训练脚本 +├──freeze_graph.py //固化脚本 +``` +# 输出 +模型存储路径为test/output/${ASCEND_DEVICE_ID},包括训练的log以及checkpoints文件。loss信息在文件test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log中。 模型固化输出为pb_model/mean-teacher.pb diff --git a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/__init__.py b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/__init__.py index 89552b1d3f5f6255840161c8c17cf314ab3fedff..a5f8598aa44dc6b32162d43ee60c98a1725037ef 100644 --- a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/__init__.py +++ b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/__init__.py @@ -1,13 +1,13 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. \ No newline at end of file diff --git a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/cifar10_final_eval.py b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/cifar10_final_eval.py index 7b602d71568949c8f510a5b5f8d3bd8f4b1a5189..306539822b21b401147889f08aecbd5c4c554980 100644 --- a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/cifar10_final_eval.py +++ b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/cifar10_final_eval.py @@ -1,112 +1,112 @@ -# Copyright (c) 2018, Curious AI Ltd. All rights reserved. -# -# This work is licensed under the Creative Commons Attribution-NonCommercial -# 4.0 International License. To view a copy of this license, visit -# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to -# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""CIFAR-10 final evaluation""" -from npu_bridge.npu_init import * - -import logging -import sys - -from experiments.run_context import RunContext -import tensorflow as tf - -from datasets import Cifar10ZCA -from mean_teacher.model import Model -from mean_teacher import minibatching - - -LOG = logging.getLogger('main') - - -def parameters(): - test_phase = True - for n_labeled in [1000, 2000, 4000, 'all']: - for model_type in ['mean_teacher', 'pi']: - if n_labeled == 'all': - n_runs = 4 - else: - n_runs = 10 - for data_seed in range(2000, 2000 + n_runs): - yield { - 'test_phase': test_phase, - 'model_type': model_type, - 'n_labeled': n_labeled, - 'data_seed': data_seed - } - - -def model_hyperparameters(model_type, n_labeled): - assert model_type in ['mean_teacher', 'pi'] - if n_labeled == 'all': - return { - 'n_labeled_per_batch': 100, - 'max_consistency_cost': 100.0, - 'apply_consistency_to_labeled': True, - 'ema_consistency': model_type == 'mean_teacher' - } - elif isinstance(n_labeled, int): - return { - 'n_labeled_per_batch': 'vary', - 'max_consistency_cost': 100.0 * n_labeled / 50000, - 'apply_consistency_to_labeled': True, - 'ema_consistency': model_type == 'mean_teacher' - } - else: - msg = "Unexpected combination: {model_type}, {n_labeled}" - assert False, msg.format(locals()) - - -def run(test_phase, n_labeled, data_seed, model_type): - minibatch_size = 100 - hyperparams = model_hyperparameters(model_type, n_labeled) - - tf.reset_default_graph() - model = Model(RunContext(__file__, data_seed)) - - cifar = Cifar10ZCA(n_labeled=n_labeled, - data_seed=data_seed, - test_phase=test_phase) - - model['flip_horizontally'] = True - model['ema_consistency'] = hyperparams['ema_consistency'] - model['max_consistency_cost'] = hyperparams['max_consistency_cost'] - model['apply_consistency_to_labeled'] = hyperparams['apply_consistency_to_labeled'] - model['adam_beta_2_during_rampup'] = 0.999 - model['ema_decay_during_rampup'] = 0.999 - model['normalize_input'] = False 
# Keep ZCA information - model['rampdown_length'] = 25000 - model['training_length'] = 150000 - - training_batches = minibatching.training_batches(cifar.training, - minibatch_size, - hyperparams['n_labeled_per_batch']) - evaluation_batches_fn = minibatching.evaluation_epoch_generator(cifar.evaluation, - minibatch_size) - - tensorboard_dir = model.save_tensorboard_graph() - LOG.info("Saved tensorboard graph to %r", tensorboard_dir) - - model.train(training_batches, evaluation_batches_fn) - - -if __name__ == "__main__": - for run_params in parameters(): - run(**run_params) - +# Copyright (c) 2018, Curious AI Ltd. All rights reserved. +# +# This work is licensed under the Creative Commons Attribution-NonCommercial +# 4.0 International License. To view a copy of this license, visit +# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to +# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""CIFAR-10 final evaluation""" +from npu_bridge.npu_init import * + +import logging +import sys + +from experiments.run_context import RunContext +import tensorflow as tf + +from datasets import Cifar10ZCA +from mean_teacher.model import Model +from mean_teacher import minibatching + + +LOG = logging.getLogger('main') + + +def parameters(): + test_phase = True + for n_labeled in [1000, 2000, 4000, 'all']: + for model_type in ['mean_teacher', 'pi']: + if n_labeled == 'all': + n_runs = 4 + else: + n_runs = 10 + for data_seed in range(2000, 2000 + n_runs): + yield { + 'test_phase': test_phase, + 'model_type': model_type, + 'n_labeled': n_labeled, + 'data_seed': data_seed + } + + +def model_hyperparameters(model_type, n_labeled): + assert model_type in ['mean_teacher', 'pi'] + if n_labeled == 'all': + return { + 'n_labeled_per_batch': 100, + 'max_consistency_cost': 100.0, + 'apply_consistency_to_labeled': True, + 'ema_consistency': model_type == 'mean_teacher' + } + elif isinstance(n_labeled, int): + return { + 'n_labeled_per_batch': 'vary', + 'max_consistency_cost': 100.0 * n_labeled / 50000, + 'apply_consistency_to_labeled': True, + 'ema_consistency': model_type == 'mean_teacher' + } + else: + msg = "Unexpected combination: {model_type}, {n_labeled}" + assert False, msg.format(locals()) + + +def run(test_phase, n_labeled, data_seed, model_type): + minibatch_size = 100 + hyperparams = model_hyperparameters(model_type, n_labeled) + + tf.reset_default_graph() + model = Model(RunContext(__file__, data_seed, './output')) + + cifar = Cifar10ZCA(n_labeled=n_labeled, + data_seed=data_seed, + test_phase=test_phase) + + model['flip_horizontally'] = True + model['ema_consistency'] = hyperparams['ema_consistency'] + model['max_consistency_cost'] = hyperparams['max_consistency_cost'] + model['apply_consistency_to_labeled'] = hyperparams['apply_consistency_to_labeled'] + model['adam_beta_2_during_rampup'] = 0.999 + model['ema_decay_during_rampup'] = 0.999 + 
model['normalize_input'] = False # Keep ZCA information + model['rampdown_length'] = 25000 + model['training_length'] = 150000 + + training_batches = minibatching.training_batches(cifar.training, + minibatch_size, + hyperparams['n_labeled_per_batch']) + evaluation_batches_fn = minibatching.evaluation_epoch_generator(cifar.evaluation, + minibatch_size) + + tensorboard_dir = model.save_tensorboard_graph() + LOG.info("Saved tensorboard graph to %r", tensorboard_dir) + + model.train(training_batches, evaluation_batches_fn) + + +if __name__ == "__main__": + for run_params in parameters(): + run(**run_params) + diff --git a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/experiments/run_context.py b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/experiments/run_context.py index 5cbadd25f0f7041d57cfb51289a04e7970bde8c1..f857605602533948a0cb52a641ff301910b1b7e1 100644 --- a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/experiments/run_context.py +++ b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/experiments/run_context.py @@ -1,84 +1,86 @@ -# Copyright (c) 2018, Curious AI Ltd. All rights reserved. -# -# This work is licensed under the Creative Commons Attribution-NonCommercial -# 4.0 International License. To view a copy of this license, visit -# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to -# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from npu_bridge.npu_init import * -from datetime import datetime -from collections import defaultdict -import threading -import time -import logging -import os - -from pandas import DataFrame -from collections import defaultdict - - -class TrainLog: - """Saves training logs in Pandas msgpacks""" - - INCREMENTAL_UPDATE_TIME = 300 - - def __init__(self, directory, name): - self.log_file_path = "{}/{}.msgpack".format(directory, name) - self._log = defaultdict(dict) - self._log_lock = threading.RLock() - self._last_update_time = time.time() - self.INCREMENTAL_UPDATE_TIME - - def record_single(self, step, column, value): - self._record(step, {column: value}) - - def record(self, step, col_val_dict): - self._record(step, col_val_dict) - - #def save(self): - #df = self._as_dataframe() - #df.to_msgpack(self.log_file_path, compress='zlib') - - def _record(self, step, col_val_dict): - with self._log_lock: - self._log[step].update(col_val_dict) - if time.time() - self._last_update_time >= self.INCREMENTAL_UPDATE_TIME: - self._last_update_time = time.time() - #self.save() - - def _as_dataframe(self): - with self._log_lock: - return DataFrame.from_dict(self._log, orient='index') - - -class RunContext: - """Creates directories and files for the run""" - - def __init__(self, runner_file, run_idx): - logging.basicConfig(level=logging.INFO, format='%(message)s') - runner_name = os.path.basename(runner_file).split(".")[0] - self.result_dir = "{root}/{runner_name}/{date:%Y-%m-%d_%H:%M:%S}/{run_idx}".format( - root='results', - runner_name=runner_name, - date=datetime.now(), 
- run_idx=run_idx - ) - self.transient_dir = self.result_dir + "/transient" - os.makedirs(self.result_dir) - os.makedirs(self.transient_dir) - - def create_train_log(self, name): - return TrainLog(self.result_dir, name) - +# Copyright (c) 2018, Curious AI Ltd. All rights reserved. +# +# This work is licensed under the Creative Commons Attribution-NonCommercial +# 4.0 International License. To view a copy of this license, visit +# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to +# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from npu_bridge.npu_init import * +from datetime import datetime +from collections import defaultdict +import threading +import time +import logging +import os + +#from pandas import DataFrame +from collections import defaultdict + + +class TrainLog: + """Saves training logs in Pandas msgpacks""" + + INCREMENTAL_UPDATE_TIME = 300 + + def __init__(self, directory, name): + self.log_file_path = "{}/{}.msgpack".format(directory, name) + self._log = defaultdict(dict) + self._log_lock = threading.RLock() + self._last_update_time = time.time() - self.INCREMENTAL_UPDATE_TIME + + def record_single(self, step, column, value): + self._record(step, {column: value}) + + def record(self, step, col_val_dict): + self._record(step, col_val_dict) + + #def save(self): + #df = self._as_dataframe() + #df.to_msgpack(self.log_file_path, compress='zlib') + + def _record(self, step, col_val_dict): + with self._log_lock: + self._log[step].update(col_val_dict) + if time.time() - self._last_update_time >= self.INCREMENTAL_UPDATE_TIME: + self._last_update_time = time.time() + #self.save() + + # def _as_dataframe(self): + # with self._log_lock: + # return DataFrame.from_dict(self._log, orient='index') + + +class RunContext: + """Creates directories and files for the run""" + + def __init__(self, runner_file, run_idx, result_pah): + logging.basicConfig(level=logging.INFO, format='%(message)s') + # runner_name = os.path.basename(runner_file).split(".")[0] + # self.result_dir = "{root}/{runner_name}/{date:%Y-%m-%d_%H:%M:%S}/{run_idx}".format( + # root='results', + # runner_name=runner_name, + # date=datetime.now(), + # run_idx=run_idx + # ) + self.result_dir = result_pah + self.transient_dir = self.result_dir + "/transient" + if not os.path.exists(self.result_dir): + os.makedirs(self.result_dir) + os.makedirs(self.transient_dir) + + def create_train_log(self, name): + return TrainLog(self.result_dir, name) + diff --git a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/freeze_graph.py 
b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/freeze_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..7fc00ebc218a29fdf9c7591bfc79016f658b88bd --- /dev/null +++ b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/freeze_graph.py @@ -0,0 +1,71 @@ +# coding=utf-8 +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from tensorflow.python.tools import freeze_graph +import argparse +import logging +import tensorflow as tf +from experiments.run_context import RunContext +from mean_teacher.model import Model + +logging.basicConfig(level=logging.INFO) +LOG = logging.getLogger('main') +parser = argparse.ArgumentParser() +parser.add_argument('--ckpt_path',type=str, default='./ckpt/checkpoint-40000',help='The path of checkpoint') + + +#running function +def run(args): + ckpt_path = args.ckpt_path + model = Model(RunContext(__file__, 0, './output')) + + LOG.info("Saved tensorboard graph to ./pb_model") + + + logits = model.class_logits_ema + output = tf.argmax(logits, -1, output_type=tf.int32, name="output") #output node will be used to inference + with tf.Session() as sess: + tf.train.write_graph(sess.graph_def, './pb_model', 'output_empty.pb') # save pb file with output node + freeze_graph.freeze_graph( + input_graph='./pb_model/output_empty.pb', # the pb file with output node + input_saver='', + input_binary=False, + input_checkpoint=ckpt_path, # input checkpoint file path + output_node_names='output', # the name of output node in pb file + restore_op_name='save/restore_all', + filename_tensor_name='save/Const:0', + output_graph='./pb_model/mean-teacher.pb', # path of output graph + clear_devices=False, + initializer_nodes='') + logging.info('done') +if __name__ == "__main__": + args = parser.parse_args() + run(args) + + diff --git a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/modelarts_entry_acc.py b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/modelarts_entry_acc.py new file mode 100644 index 0000000000000000000000000000000000000000..13077b10e660de32d6f7861257a50e1a01ede9ba --- /dev/null +++ b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/modelarts_entry_acc.py @@ -0,0 +1,63 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import argparse +import sys + +# 解析输入参数data_url +parser = argparse.ArgumentParser() +parser.add_argument("--data_url", type=str, default="/home/ma-user/modelarts/inputs/data_url_0") +parser.add_argument("--train_url", type=str, default="/home/ma-user/modelarts/outputs/train_url_0/") +config = parser.parse_args() + +print("[CANN-Modelzoo] code_dir path is [%s]" % (sys.path[0])) +code_dir = sys.path[0] +os.chdir(code_dir) +print("[CANN-Modelzoo] work_dir path is [%s]" % (os.getcwd())) + +print("[CANN-Modelzoo] before train - list my run files:") +os.system("ls -al /usr/local/Ascend/ascend-toolkit/") + +print("[CANN-Modelzoo] before train - list my dataset files:") +os.system("ls -al %s" % config.data_url) + +print("[CANN-Modelzoo] start run train shell") +# 设置sh文件格式为linux可执行 +os.system("dos2unix ./test/*") + +# 执行train_full_1p.sh或者train_performance_1p.sh,需要用户自己指定 +# full和performance的差异,performance只需要执行很少的step,控制在15分钟以内,主要关注性能FPS +os.system("bash ./test/train_full_1p.sh --data_path=%s --output_path=%s " % (config.data_url, config.train_url)) + +print("[CANN-Modelzoo] finish run train shell") + +# 将当前执行目录所有文件拷贝到obs的output进行备份 +print("[CANN-Modelzoo] after train - list my output files:") +os.system("cp -r %s %s " % (code_dir, config.train_url)) +os.system("ls -al %s" % config.train_url) diff --git a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/modelarts_entry_perf.py b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/modelarts_entry_perf.py new file mode 100644 index 0000000000000000000000000000000000000000..14384e227a0fa90a514254590aef5078c62ff700 --- /dev/null +++ b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/modelarts_entry_perf.py @@ -0,0 +1,63 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import argparse +import sys + +# 解析输入参数data_url +parser = argparse.ArgumentParser() +parser.add_argument("--data_url", type=str, default="/home/ma-user/modelarts/inputs/data_url_0") +parser.add_argument("--train_url", type=str, default="/home/ma-user/modelarts/outputs/train_url_0/") +config = parser.parse_args() + +print("[CANN-Modelzoo] code_dir path is [%s]" % (sys.path[0])) +code_dir = sys.path[0] +os.chdir(code_dir) +print("[CANN-Modelzoo] work_dir path is [%s]" % (os.getcwd())) + +print("[CANN-Modelzoo] before train - list my run files:") +os.system("ls -al /usr/local/Ascend/ascend-toolkit/") + +print("[CANN-Modelzoo] before train - list my dataset files:") +os.system("ls -al %s" % config.data_url) + +print("[CANN-Modelzoo] start run train shell") +# 设置sh文件格式为linux可执行 +os.system("dos2unix ./test/*") + +# 执行train_full_1p.sh或者train_performance_1p.sh,需要用户自己指定 +# full和performance的差异,performance只需要执行很少的step,控制在15分钟以内,主要关注性能FPS +os.system("bash ./test/train_performance_1p.sh --data_path=%s --output_path=%s " % (config.data_url, config.train_url)) + +print("[CANN-Modelzoo] finish run train shell") + +# 将当前执行目录所有文件拷贝到obs的output进行备份 +print("[CANN-Modelzoo] after train - list my output files:") +os.system("cp -r %s %s " % (code_dir, config.train_url)) +os.system("ls -al %s" % config.train_url) diff --git a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/modelzoo_level.txt b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/modelzoo_level.txt index 3aaa91c1deb7a7eadf44ea3e5ac28be53a378cf2..7eeb8d729d7fb2dd94b91dcf79f8eabd5cfc5b77 100644 --- a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/modelzoo_level.txt +++ b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/modelzoo_level.txt @@ -1,6 +1,3 @@ -GPUStatus:OK -NPUMigrationStatus:POK -FuncStatus:OK -PrecisionStatus:POK -AutoTune:NOK -PerfStatus:POK \ No newline at end of file +FuncStatus:OK +PerfStatus:OK +PrecisionStatus:OK diff --git 
a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/prepare_data.sh b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/prepare_data.sh index 96d256e9f23a1b08c0bfb347af0820b7529b255c..c2d9c1ade9e1ce843f2f07b320d23c1ab31c9849 100644 --- a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/prepare_data.sh +++ b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/prepare_data.sh @@ -40,7 +40,7 @@ mkdir -p data/images/cifar/cifar10 echo echo "Preprocessing CIFAR-10" -python datasets/preprocess_cifar10.py +python3 datasets/preprocess_cifar10.py echo echo "All done!" diff --git a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/test/train_full_1p.sh index 59315a88024f710357be60838d75cf1f0d8f0413..f77819ab16aeb196c92bbfb273af805e23eae948 100644 --- a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/test/train_full_1p.sh +++ b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/test/train_full_1p.sh @@ -1,160 +1,183 @@ #!/bin/bash -#当前路径,不需要修改 -cur_path=`pwd`/../ +########################################################## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +########################################################## +# shell脚本所在路径 +cur_path=`echo $(cd $(dirname $0);pwd)` -#集合通信参数,不需要修改 +# 判断当前shell是否是performance +perf_flag=`echo $0 | grep performance | wc -l` + +# 当前执行网络的名称 +Network=`echo $(cd $(dirname $0);pwd) | awk -F"/" '{print $(NF-1)}'` export RANK_SIZE=1 +export RANK_ID=0 export JOB_ID=10087 -RANK_ID_START=0 - - -# 数据集路径,保持为空,不需要修改 -data_path='' -ckpt_path='' - -#设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL=3 -#export ASCEND_DEVICE_ID=3 - -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="MEAN-TEACHER_ID0789_for_TensorFlow" -#训练epoch -train_epochs= -#训练batch_size -batch_size=256 -#训练step -train_steps= -#学习率 -learning_rate= - -#维测参数,precision_mode需要模型审视修改 
-precision_mode="allow_mix_precision" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False + +# 路径参数初始化 +data_path="" +output_path="" # 帮助信息,不需要修改 if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_performance_1p.sh " + echo"usage:./train_performance_1P.sh " echo " " echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - -h/--help show help message + --data_path # dataset of training + --output_path # output of training + --train_steps # max_step for training + --train_epochs # max_epoch for training + --batch_size # batch size + -h/--help show help message " exit 1 fi -#参数校验,不需要修改 +# 参数校验,不需要修改 for para in $* do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/test/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/test/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/test/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then + if [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` - - elif [[ $para == --ckpt_path* ]];then - ckpt_path=`echo ${para#*=}` + elif [[ $para == --output_path* ]];then + output_path=`echo ${para#*=}` + elif [[ $para == --train_steps* ]];then + train_steps=`echo ${para#*=}` + elif [[ $para == 
--train_epochs* ]];then + train_epochs=`echo ${para#*=}` + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` fi done -#校验是否传入data_path,不需要修改 +# 校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" + echo "[Error] para \"data_path\" must be config" exit 1 - fi -#训练开始时间,不需要修改 -start_time=$(date +%s) - -#进入训练脚本目录,需要模型审视修改 -cd $cur_path -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); -do - #设置环境变量,不需要修改 - echo "Device ID: $ASCEND_DEVICE_ID" - export RANK_ID=$RANK_ID - - +# 校验是否传入output_path,不需要修改 +if [[ $output_path == "" ]];then + output_path="./test/output/${ASCEND_DEVICE_ID}" +fi - #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/test/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt +# 设置打屏日志文件名,请保留,文件名为${print_log} +print_log="./test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" +modelarts_flag=${MODELARTS_MODEL_PATH} +if [ x"${modelarts_flag}" != x ]; +then + echo "running without etp..." 
+ print_log_name=`ls /home/ma-user/modelarts/log/ | grep proc-rank` + print_log="/home/ma-user/modelarts/log/${print_log_name}" +fi +echo "### get your log here : ${print_log}" + +CaseName="" +function get_casename() +{ + if [ x"${perf_flag}" = x1 ]; + then + CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'perf' else - mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt + CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'acc' fi +} - #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +# 跳转到code目录 +cd ${cur_path}/../ +rm -rf ./test/output/${ASCEND_DEVICE_ID} +mkdir -p ./test/output/${ASCEND_DEVICE_ID} - python3 train_cifar10.py \ - --data_path=${data_path} > ${cur_path}test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 +# 训练开始时间记录,不需要修改 +start_time=$(date +%s) +########################################################## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +########################################################## + +#========================================================= +#========================================================= +#========训练执行命令,需要根据您的网络进行修改============== +#========================================================= +#========================================================= +# 基础参数,需要模型审视修改 +# 您的训练数据集在${data_path}路径下,请直接使用这个变量获取 +# 您的训练输出目录在${output_path}路径下,请直接使用这个变量获取 +# 您的其他基础参数,可以自定义增加,但是batch_size请保留,并且设置正确的值 +batch_size=100 +train_steps=40000 +if [ x"${modelarts_flag}" != x ]; +then + python3.7 ./train_cifar10.py --data_path=${data_path} --output_path=${output_path} --training_length=${train_steps} +else + python3.7 ./train_cifar10.py --data_path=${data_path} --output_path=${output_path} --training_length=${train_steps} 1>${print_log} 2>&1 +fi + +# 性能相关数据计算 +StepTime=`grep "Perf: " ${print_log} |awk 'END {print $11}'` +FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${StepTime}'}'` + +# 精度相关数据计算 +train_accuracy=`grep "Evaluate" 
${print_log} |awk 'END {print $5}'|sed 's/,//g'|sed 's/%//g'` +# 提取所有loss打印信息 +grep "train/class_cost/1:" ${print_log} | awk '{print $7}'|sed 's/,//g'|sed 's/%//g' > ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt + +########################################################### +#########后面的所有内容请不要修改########################### +#########后面的所有内容请不要修改########################### +#########后面的所有内容请不要修改########################### +########################################################### + +# 判断本次执行是否正确使用Ascend NPU +use_npu_flag=`grep "The model has been compiled on the Ascend AI processor" ${print_log} | wc -l` +if [ x"${use_npu_flag}" == x0 ]; +then + echo "------------------ ERROR NOTICE START ------------------" + echo "ERROR, your task haven't used Ascend NPU, please check your npu Migration." + echo "------------------ ERROR NOTICE END------------------" +else + echo "------------------ INFO NOTICE START------------------" + echo "INFO, your task have used Ascend NPU, please check your result." 
+ echo "------------------ INFO NOTICE END------------------" +fi +# 获取最终的casename,请保留,case文件名为${CaseName} +get_casename -done -wait +# 重命名loss文件 +if [ -f ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ]; +then + mv ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt +fi -#训练结束时间,不需要修改 +# 训练端到端耗时 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) -#结果打印,不需要修改 echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -TrainingTime=`grep "Perf: " $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk 'END {print $11}'` - - -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - -##获取性能数据,不需要修改 -#吞吐量 -ActualFPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${TrainingTime}'}'` - -#获取模型精度,该网络为错误率 -train_accuracy=`grep "train/error" $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk 'END {print $5}'|sed 's/,//g'|sed 's/%//g'` +# 输出性能FPS/单step耗时/端到端耗时 +echo "Final Performance images/sec : $FPS" +echo "Final Performance sec/step : $StepTime" +echo "E2E Training Duration sec : $e2e_time" -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep 'train/class_cost' $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $7}'|sed 's/,//g'|sed 's/%//g' > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +# 输出训练精度 +echo "Final Train Accuracy : $train_accuracy" -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +# 最后一个迭代loss值,不需要修改 +ActualLoss=(`awk 'END {print $NF}' $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt`) #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo 
"BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = `uname -m`" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${FPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${StepTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/test/train_performance_1p.sh index 6fdadae8b01615bc7f817d026303e53bf3d10756..fbbb4a26962b7c134e975a81da21830be7f6825b 100644 --- 
a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/test/train_performance_1p.sh +++ b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/test/train_performance_1p.sh @@ -1,161 +1,187 @@ #!/bin/bash -#当前路径,不需要修改 -cur_path=`pwd`/../ +########################################################## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +########################################################## +# shell脚本所在路径 +cur_path=`echo $(cd $(dirname $0);pwd)` -#集合通信参数,不需要修改 +# 判断当前shell是否是performance +perf_flag=`echo $0 | grep performance | wc -l` + +# 当前执行网络的名称 +Network=`echo $(cd $(dirname $0);pwd) | awk -F"/" '{print $(NF-1)}'` export RANK_SIZE=1 +export RANK_ID=0 export JOB_ID=10087 -RANK_ID_START=0 - -# 数据集路径,保持为空,不需要修改 -data_path='' -ckpt_path='' - -#设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL=3 -#export ASCEND_DEVICE_ID=3 - -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="MEAN-TEACHER_ID0789_for_TensorFlow" -#训练epoch -train_epochs= -#训练batch_size -batch_size=256 -#训练step -train_steps=100 -#学习率 -learning_rate= - -#维测参数,precision_mode需要模型审视修改 -precision_mode="allow_mix_precision" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False +# 路径参数初始化 +data_path="" +output_path="" # 帮助信息,不需要修改 if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_performance_1p.sh " + echo"usage:./train_performance_1P.sh " echo " " echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - -h/--help show help message + --data_path # dataset of training + --output_path # output of training + --train_steps # max_step for 
training + --train_epochs # max_epoch for training + --batch_size # batch size + -h/--help show help message " exit 1 fi -#参数校验,不需要修改 +# 参数校验,不需要修改 for para in $* do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/test/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/test/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/test/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then + if [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` - - elif [[ $para == --ckpt_path* ]];then - ckpt_path=`echo ${para#*=}` + elif [[ $para == --output_path* ]];then + output_path=`echo ${para#*=}` + elif [[ $para == --train_steps* ]];then + train_steps=`echo ${para#*=}` + elif [[ $para == --train_epochs* ]];then + train_epochs=`echo ${para#*=}` + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` fi done -#校验是否传入data_path,不需要修改 +# 校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" + echo "[Error] para \"data_path\" must be config" exit 1 +fi +# 校验是否传入output_path,不需要修改 +if [[ $output_path == "" ]];then + output_path="./test/output/${ASCEND_DEVICE_ID}" fi -#训练开始时间,不需要修改 -start_time=$(date +%s) +# 设置打屏日志文件名,请保留,文件名为${print_log} +print_log="./test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" +modelarts_flag=${MODELARTS_MODEL_PATH} +if [ x"${modelarts_flag}" != x ]; +then + echo "running with modelarts..." 
+ print_log_name=`ls /home/ma-user/modelarts/log/ | grep proc-rank` + print_log="/home/ma-user/modelarts/log/${print_log_name}" +fi +echo "### get your log here : ${print_log}" + +CaseName="" +function get_casename() +{ + if [ x"${perf_flag}" = x1 ]; + then + CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'perf' + else + CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'acc' + fi +} -#进入训练脚本目录,需要模型审视修改 -cd $cur_path -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); -do - #设置环境变量,不需要修改 - echo "Device ID: $ASCEND_DEVICE_ID" - export RANK_ID=$RANK_ID +# 跳转到code目录 +cd ${cur_path}/../ +rm -rf ./test/output/${ASCEND_DEVICE_ID} +mkdir -p ./test/output/${ASCEND_DEVICE_ID} +# 训练开始时间记录,不需要修改 +start_time=$(date +%s) +########################################################## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +########################################################## + +#========================================================= +#========================================================= +#========训练执行命令,需要根据您的网络进行修改============== +#========================================================= +#========================================================= +# 基础参数,需要模型审视修改 +# 您的训练数据集在${data_path}路径下,请直接使用这个变量获取 +# 您的训练输出目录在${output_path}路径下,请直接使用这个变量获取 +# 您的其他基础参数,可以自定义增加,但是batch_size请保留,并且设置正确的值 +train_epochs=1 +train_steps=100 +batch_size=100 +if [ x"${modelarts_flag}" != x ]; +then + python3.7 ./train_cifar10.py --data_path=${data_path} --output_path=${output_path} --training_length=${train_steps} +else + python3.7 ./train_cifar10.py --data_path=${data_path} --output_path=${output_path} --training_length=${train_steps} 1>${print_log} 2>&1 +fi - #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/test/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt - else - 
mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt - fi +# 性能相关数据计算 +StepTime=`grep "Perf: " ${print_log} |awk 'END {print $11}'` +FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${StepTime}'}'` - #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +# 精度相关数据计算 +train_accuracy=`grep "Evaluate" ${print_log} |awk 'END {print $5}'|sed 's/,//g'|sed 's/%//g'` - python3 train_cifar10.py \ - --data_path=${data_path} \ - --training_length=${train_steps} > ${cur_path}test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 +# 提取所有loss打印信息 +grep "train/class_cost/1:" ${print_log} | awk '{print $7}'|sed 's/,//g'|sed 's/%//g' > ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt -done -wait -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) +########################################################### +#########后面的所有内容请不要修改########################### +#########后面的所有内容请不要修改########################### +#########后面的所有内容请不要修改########################### +########################################################### -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -TrainingTime=`grep "Perf: " $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk 'END {print $11}'` +# 判断本次执行是否正确使用Ascend NPU +use_npu_flag=`grep "The model has been compiled on the Ascend AI processor" ${print_log} | wc -l` +if [ x"${use_npu_flag}" == x0 ]; +then + echo "------------------ ERROR NOTICE START ------------------" + echo "ERROR, your task haven't used Ascend NPU, please check your npu Migration." + echo "------------------ ERROR NOTICE END------------------" +else + echo "------------------ INFO NOTICE START------------------" + echo "INFO, your task have used Ascend NPU, please check your result." 
+ echo "------------------ INFO NOTICE END------------------" +fi +# 获取最终的casename,请保留,case文件名为${CaseName} +get_casename -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +# 重命名loss文件 +if [ -f ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ]; +then + mv ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt +fi -##获取性能数据,不需要修改 -#吞吐量 -ActualFPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${TrainingTime}'}'` +# 训练端到端耗时 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) -#获取模型精度,该网络为错误率 -train_accuracy=`grep "train/error" $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk 'END {print $5}'|sed 's/,//g'|sed 's/%//g'` +echo "------------------ Final result ------------------" +# 输出性能FPS/单step耗时/端到端耗时 +echo "Final Performance images/sec : $FPS" +echo "Final Performance sec/step : $StepTime" +echo "E2E Training Duration sec : $e2e_time" -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep 'train/class_cost' $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $7}'|sed 's/,//g'|sed 's/%//g' > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +# 输出训练精度 +echo "Final Train Accuracy : $train_accuracy" -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +# 最后一个迭代loss值,不需要修改 +ActualLoss=(`awk 'END {print $NF}' $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt`) #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo 
"CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = `uname -m`" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${FPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${StepTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/train_cifar10.py b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/train_cifar10.py index f81d12444739e13f0a4c2966bc4425d6e2fb92a1..58bea02f6065d0a8830700847d3c7066f9146c99 100644 --- a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/train_cifar10.py +++ b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/train_cifar10.py @@ -1,68 +1,70 @@ -# Copyright (c) 2018, Curious AI Ltd. All rights reserved. 
-# -# This work is licensed under the Creative Commons Attribution-NonCommercial -# 4.0 International License. To view a copy of this license, visit -# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to -# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Train ConvNet Mean Teacher on CIFAR-10 training set and evaluate against a validation set - -This runner converges quickly to a fairly good accuracy. -On the other hand, the runner experiments/cifar10_final_eval.py -contains the hyperparameters used in the paper, and converges -much more slowly but possibly to a slightly better accuracy. 
-""" -from npu_bridge.npu_init import * -import argparse -import logging - -from experiments.run_context import RunContext -from datasets import Cifar10ZCA -from mean_teacher.model import Model -from mean_teacher import minibatching - -logging.basicConfig(level=logging.INFO) -LOG = logging.getLogger('main') -parser = argparse.ArgumentParser() -parser.add_argument('--data_path',type=str, default='data',help='The path of dataset') -parser.add_argument('--n_labeled',type=int, default=4000,help='The num of labeled images') -parser.add_argument('--training_length',type=int, default=40000,help='The steps o training') - -#running function -def run(data_seed, args): - n_labeled = args.n_labeled - data_path = args.data_path - model = Model(RunContext(__file__, 0)) - model['flip_horizontally'] = True - model['normalize_input'] = False # Keep ZCA information - model['rampdown_length'] = 0 - model['rampup_length'] = 5000 - model['training_length'] = args.training_length - model['max_consistency_cost'] = 50.0 - - tensorboard_dir = model.save_tensorboard_graph() - LOG.info("Saved tensorboard graph to %r", tensorboard_dir) - - cifar = Cifar10ZCA(data_seed, n_labeled, data_path) - training_batches = minibatching.training_batches(cifar.training, n_labeled_per_batch=50) - evaluation_batches_fn = minibatching.evaluation_epoch_generator(cifar.evaluation) - - model.train(training_batches, evaluation_batches_fn) - -if __name__ == "__main__": - args = parser.parse_args() - run(0,args) - +# Copyright (c) 2018, Curious AI Ltd. All rights reserved. +# +# This work is licensed under the Creative Commons Attribution-NonCommercial +# 4.0 International License. To view a copy of this license, visit +# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to +# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. 
+# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Train ConvNet Mean Teacher on CIFAR-10 training set and evaluate against a validation set + +This runner converges quickly to a fairly good accuracy. +On the other hand, the runner experiments/cifar10_final_eval.py +contains the hyperparameters used in the paper, and converges +much more slowly but possibly to a slightly better accuracy. 
+""" +from npu_bridge.npu_init import * +import argparse +import logging + +from experiments.run_context import RunContext +from datasets import Cifar10ZCA +from mean_teacher.model import Model +from mean_teacher import minibatching + +logging.basicConfig(level=logging.INFO) +LOG = logging.getLogger('main') +parser = argparse.ArgumentParser() +parser.add_argument('--data_path',type=str, default='data',help='The path of dataset') +parser.add_argument('--n_labeled',type=int, default=4000,help='The num of labeled images') +parser.add_argument('--training_length',type=int, default=40000,help='The steps o training') +parser.add_argument('--output_path',type=str, default='output',help='The path of output') + +#running function +def run(data_seed, args): + n_labeled = args.n_labeled + data_path = args.data_path + output_path = args.output_path + model = Model(RunContext(__file__, 0, output_path)) + model['flip_horizontally'] = True + model['normalize_input'] = False # Keep ZCA information + model['rampdown_length'] = 0 + model['rampup_length'] = 5000 + model['training_length'] = args.training_length + model['max_consistency_cost'] = 50.0 + + tensorboard_dir = model.save_tensorboard_graph() + LOG.info("Saved tensorboard graph to %r", tensorboard_dir) + + cifar = Cifar10ZCA(data_seed, n_labeled, data_path) + training_batches = minibatching.training_batches(cifar.training, n_labeled_per_batch=50) + evaluation_batches_fn = minibatching.evaluation_epoch_generator(cifar.evaluation) + + model.train(training_batches, evaluation_batches_fn) + +if __name__ == "__main__": + args = parser.parse_args() + run(0,args) + diff --git a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/train_svhn.py b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/train_svhn.py index b10d7a5e19c6b96ca1a6184e47d88ed397221003..812315e323a57d9f7c6e4c65431bcb76d2e186d7 100644 --- a/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/train_svhn.py +++ 
b/TensorFlow/contrib/cv/MEAN-TEACHER_ID0789_for_TensorFlow/train_svhn.py @@ -1,65 +1,65 @@ -# Copyright (c) 2018, Curious AI Ltd. All rights reserved. -# -# This work is licensed under the Creative Commons Attribution-NonCommercial -# 4.0 International License. To view a copy of this license, visit -# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to -# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Train ConvNet Mean Teacher on SVHN training set and evaluate against a validation set - -This runner converges quickly to a fairly good accuracy. -On the other hand, the runner experiments/svhn_final_eval.py -contains the hyperparameters used in the paper, and converges -much more slowly but possibly to a slightly better accuracy. 
-""" -from npu_bridge.npu_init import * - -import logging -from datetime import datetime - -from experiments.run_context import RunContext -from datasets import SVHN -from mean_teacher.model import Model -from mean_teacher import minibatching - - -logging.basicConfig(level=logging.INFO) -LOG = logging.getLogger('main') - - -def run(data_seed=0): - n_labeled = 500 - n_extra_unlabeled = 0 - - model = Model(RunContext(__file__, 0)) - model['rampdown_length'] = 0 - model['rampup_length'] = 5000 - model['training_length'] = 40000 - model['max_consistency_cost'] = 50.0 - - tensorboard_dir = model.save_tensorboard_graph() - LOG.info("Saved tensorboard graph to %r", tensorboard_dir) - - svhn = SVHN(data_seed, n_labeled, n_extra_unlabeled) - training_batches = minibatching.training_batches(svhn.training, n_labeled_per_batch=50) - evaluation_batches_fn = minibatching.evaluation_epoch_generator(svhn.evaluation) - - model.train(training_batches, evaluation_batches_fn) - - -if __name__ == "__main__": - run() - +# Copyright (c) 2018, Curious AI Ltd. All rights reserved. +# +# This work is licensed under the Creative Commons Attribution-NonCommercial +# 4.0 International License. To view a copy of this license, visit +# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to +# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Train ConvNet Mean Teacher on SVHN training set and evaluate against a validation set + +This runner converges quickly to a fairly good accuracy. +On the other hand, the runner experiments/svhn_final_eval.py +contains the hyperparameters used in the paper, and converges +much more slowly but possibly to a slightly better accuracy. +""" +from npu_bridge.npu_init import * + +import logging +from datetime import datetime + +from experiments.run_context import RunContext +from datasets import SVHN +from mean_teacher.model import Model +from mean_teacher import minibatching + + +logging.basicConfig(level=logging.INFO) +LOG = logging.getLogger('main') + + +def run(data_seed=0): + n_labeled = 500 + n_extra_unlabeled = 0 + + model = Model(RunContext(__file__, 0, './output')) + model['rampdown_length'] = 0 + model['rampup_length'] = 5000 + model['training_length'] = 40000 + model['max_consistency_cost'] = 50.0 + + tensorboard_dir = model.save_tensorboard_graph() + LOG.info("Saved tensorboard graph to %r", tensorboard_dir) + + svhn = SVHN(data_seed, n_labeled, n_extra_unlabeled) + training_batches = minibatching.training_batches(svhn.training, n_labeled_per_batch=50) + evaluation_batches_fn = minibatching.evaluation_epoch_generator(svhn.evaluation) + + model.train(training_batches, evaluation_batches_fn) + + +if __name__ == "__main__": + run() +