From 73e2ede6524bbec3be72d040709d3e1d428013ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E8=8E=89?= <2020857336@qq.com>
Date: Tue, 28 Nov 2023 05:33:27 +0000
Subject: [PATCH 1/2] 8P test case runs with static shapes, no V2 conversion needed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 石莉 <2020857336@qq.com>
---
 .../test/train_performance_8p.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_performance_8p.sh
index 74b47dffd..4b1e9add7 100644
--- a/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_performance_8p.sh
+++ b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/test/train_performance_8p.sh
@@ -119,7 +119,7 @@ for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
     if [ "x${bind_core}" != x ];then
         bind_core="taskset -c $a-$c"
     fi
-    nohup ${bind_core} python3 -u ./object_detection/model_main.py \
+    nohup ${bind_core} python3 -u ./object_detection/model_main_8p.py \
         --pipeline_config_path=${pipeline_config} \
         --model_dir=$cur_path/output/${ASCEND_DEVICE_ID_START} \
         --data_path=${data_path} \
--
Gitee

From 4a08378f5c0cb3b6d31fcd8844c692de2d206348 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E8=8E=89?= <2020857336@qq.com>
Date: Tue, 28 Nov 2023 05:36:04 +0000
Subject: [PATCH 2/2] 8P test case runs with static shapes, no V2 conversion
 needed; execute the original script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 石莉 <2020857336@qq.com>
---
 .../object_detection/model_main_8p.py | 175 ++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/models/research/object_detection/model_main_8p.py

diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/models/research/object_detection/model_main_8p.py b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/models/research/object_detection/model_main_8p.py
new file mode 100644
index 000000000..b2706af1f
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/SSD-Resnet50V1-FPN_ID1463_for_TensorFlow/models/research/object_detection/model_main_8p.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Binary to run train and evaluation on object detection model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from npu_bridge.npu_init import *
+from tensorflow.core.protobuf import config_pb2
+from absl import flags
+import tensorflow as tf
+#import horovod.tensorflow as hvd
+import dllogger
+import time
+import os
+from object_detection import model_hparams
+from object_detection import model_lib
+from object_detection.utils.exp_utils import AverageMeter, setup_dllogger
+
+flags.DEFINE_string('model_dir', None, 'Path to output model directory where event and checkpoint files will be written.')
+flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config file.')
+flags.DEFINE_string('raport_file', default='summary.json', help='Path to dllogger JSON output file.')
+flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.')
+flags.DEFINE_boolean('eval_training_data', False, 'If training data should be evaluated for this job. Note that one can only use this in eval-only mode, and `checkpoint_dir` must be supplied.')
+flags.DEFINE_integer('sample_1_of_n_eval_examples', 1, 'Will sample one of every n eval input examples, where n is provided.')
+flags.DEFINE_integer('sample_1_of_n_eval_on_train_examples', 5, 'Will sample one of every n train input examples for evaluation, where n is provided. This is only used if `eval_training_data` is True.')
+flags.DEFINE_integer('eval_count', 1, 'How many times the evaluation should be run.')
+flags.DEFINE_string('hparams_overrides', None, 'Hyperparameter overrides, represented as a string containing comma-separated hparam_name=value pairs.')
+flags.DEFINE_string('checkpoint_dir', None, 'Path to directory holding a checkpoint. If `checkpoint_dir` is provided, this binary operates in eval-only mode, writing resulting metrics to `model_dir`.')
+flags.DEFINE_boolean('allow_xla', False, 'Enable XLA compilation.')
+flags.DEFINE_boolean('amp', False, 'Whether to enable AMP ops. When false, uses TF32 on A100 and FP32 on V100 GPUs.')
+flags.DEFINE_boolean('run_once', False, 'If running in eval-only mode, whether to run just one round of eval vs running continuously (default).')
+############################NPU_modify add########################################
+flags.DEFINE_boolean('overflow_dump', False, 'Enable overflow op detection.')
+flags.DEFINE_string('overflow_dump_path', None, 'Path to the directory where overflow op data is dumped.')
+flags.DEFINE_boolean('check_loss_scale', False, 'Check whether the loss scale is valid.')
+flags.DEFINE_boolean('step_dump', False, 'Enable dumping of step data; can only be set when overflow_dump is not set.')
+flags.DEFINE_string('step_dump_path', None, 'Path to the directory where step 0 op data is dumped.')
+flags.DEFINE_boolean('skip_eval', False, 'Whether to skip eval.')
+############################NPU_modify end########################################
+FLAGS = flags.FLAGS
+
+
+class DLLoggerHook(tf.estimator.SessionRunHook):
+
+    def __init__(self, global_batch_size, rank=(- 1)):
+        self.global_batch_size = global_batch_size
+        self.rank = rank
+        setup_dllogger(enabled=True, filename=FLAGS.raport_file, rank=rank)
+
+    def after_create_session(self, session, coord):
+        self.meters = {}
+        warmup = 100
+        self.meters['train_throughput'] = AverageMeter(warmup=warmup)
+
+    def before_run(self, run_context):
+        self.t0 = time.time()
+        return tf.estimator.SessionRunArgs(fetches=['global_step:0', 'learning_rate:0'])
+
+    def after_run(self, run_context, run_values):
+        throughput = (self.global_batch_size / (time.time() - self.t0))
+        (global_step, lr) = run_values.results
+        self.meters['train_throughput'].update(throughput)
+
+    def end(self, session):
+        summary = {'train_throughput': self.meters['train_throughput'].avg}
+        dllogger.log(step=tuple(), data=summary)
+
+###############################NPU_modify add#####################################
+class _LogSessionRunHook(tf.train.SessionRunHook):
+    def before_run(self, run_context):
+        return tf.estimator.SessionRunArgs(fetches=['overflow_status_reduce_all:0', 'loss_scale:0'])
+
+    def after_run(self, run_context, run_values):
+        if not run_values.results[0]:
+            print('Found overflow in this step, skipping apply_gradients, loss scale value=%d' % run_values.results[1], flush=True)
+        else:
+            print('Applying gradients, loss scale value=%d' % run_values.results[1], flush=True)
+###############################NPU_modify end#####################################
+def main(unused_argv):
+    tf.logging.set_verbosity(tf.logging.INFO)
+    # TensorFlow automatic mixed precision
+    if FLAGS.amp:
+        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
+    else:
+        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '0'
+    flags.mark_flag_as_required('model_dir')
+    flags.mark_flag_as_required('pipeline_config_path')
+    if True:
+        session_config = npu_config_proto(config_proto=tf.ConfigProto())
+
+        session_config.gpu_options.per_process_gpu_memory_fraction = 0.9
+        session_config.gpu_options.visible_device_list = str(get_npu_local_rank_id())
+    if FLAGS.allow_xla:
+        if True:
+            session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+    model_dir = (FLAGS.model_dir if (get_npu_rank_id() == 0) else None)
+    config = tf.estimator.RunConfig(model_dir=model_dir, session_config=session_config)
+
+    train_and_eval_dict = model_lib.create_estimator_and_inputs(run_config=config, eval_count=FLAGS.eval_count, hparams=model_hparams.create_hparams(FLAGS.hparams_overrides), pipeline_config_path=FLAGS.pipeline_config_path, train_steps=FLAGS.num_train_steps, sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples, sample_1_of_n_eval_on_train_examples=FLAGS.sample_1_of_n_eval_on_train_examples)
+    estimator = train_and_eval_dict['estimator']
+    train_input_fn = train_and_eval_dict['train_input_fn']
+    eval_input_fns = train_and_eval_dict['eval_input_fns']
+    eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
+    predict_input_fn = train_and_eval_dict['predict_input_fn']
+    train_steps = train_and_eval_dict['train_steps']
+    if FLAGS.checkpoint_dir:
+        if FLAGS.eval_training_data:
+            name = 'training_data'
+            input_fn = eval_on_train_input_fn
+        else:
+            name = 'validation_data'
+            input_fn = eval_input_fns[0]
+        #if FLAGS.run_once:
+        #    estimator.evaluate(input_fn, steps=None, checkpoint_path=tf.train.latest_checkpoint(FLAGS.checkpoint_dir))
+        #else:
+        #    model_lib.continuous_eval(estimator, FLAGS.checkpoint_dir, input_fn, train_steps, name)
+    else:
+        (train_spec, eval_specs) = model_lib.create_train_and_eval_specs(train_input_fn, eval_input_fns, eval_on_train_input_fn, predict_input_fn, train_steps, eval_on_train_data=False)
+        ##################################NPU_modify add###################################
+        if FLAGS.check_loss_scale:
+            train_hooks = [NpuEmptyHook(), DLLoggerHook((get_rank_size() * train_and_eval_dict['train_batch_size']), get_npu_rank_id()), _LogSessionRunHook()]
+        else:
+            train_hooks = [NpuEmptyHook(), DLLoggerHook((get_rank_size() * train_and_eval_dict['train_batch_size']), get_npu_rank_id())]
+        #train_hooks = [NpuEmptyHook(), DLLoggerHook((get_rank_size() * train_and_eval_dict['train_batch_size']), get_rank_id())]
+        ##################################NPU_modify end###################################
+        eval_hooks = []
+        for x in range(FLAGS.eval_count):
+            estimator.train(train_input_fn, hooks=npu_hooks_append(hooks_list=train_hooks), steps=(train_steps // FLAGS.eval_count))
+            if (get_npu_rank_id() == 0):
+                eval_input_fn = eval_input_fns[0]
+                # Eval hangs on NPU; skip it as a temporary workaround
+                if FLAGS.skip_eval:
+                    print("[debug]skip eval.")
+                else:
+                    print("[debug]enter eval process ...")
+                    results = estimator.evaluate(eval_input_fn, steps=None, hooks=eval_hooks)
+
+if (__name__ == '__main__'):
+    session_config = tf.ConfigProto()
+    custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
+    # custom_op.parameter_map["mix_compile_mode"].b = True
+    custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("../../configs/ops_info.json")
+    (npu_sess, npu_shutdown) = init_resource(config=session_config)
+    tf.app.run()
+    shutdown_resource(npu_sess, npu_shutdown)
+    close_session(npu_sess)
\ No newline at end of file
--
Gitee
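
Usage note: patch 1 simply repoints the existing 8P performance script at this new entry point. A minimal single-process invocation of the new script might look like the sketch below; the config and output paths are placeholders, and it assumes an Ascend environment where npu_bridge and the object_detection package are importable:

    python3 -u ./object_detection/model_main_8p.py \
        --pipeline_config_path=./configs/pipeline.config \
        --model_dir=./output/0 \
        --num_train_steps=1000 \
        --skip_eval

The actual 8P run in test/train_performance_8p.sh additionally sets the per-rank environment, binds cores with taskset, and passes further arguments such as --data_path, as shown in the hunk in patch 1.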