diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/run_classifier.py b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/run_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..075b28edda66b6d499e80ae3aab94cdf167af2a6 --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/run_classifier.py @@ -0,0 +1,862 @@ +# coding=utf-8 +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * + +import collections +import csv +import os +import modeling +import optimization +import tokenization +import tensorflow as tf +#import horovod.tensorflow as hvd +import time, sys +from npu_bridge.estimator.npu.npu_estimator import NPUEstimator,NPUEstimatorSpec +from utils.utils import LogEvalRunHook, LogTrainRunHook, setup_xla_flags +#from utils.gpu_affinity import set_affinity +import utils.dllogger_class +from dllogger import Verbosity +from utils.create_glue_data import * +import numpy as np +import tf_metrics + +os.environ['GE_USE_STATIC_MEMORY'] = '1' + +rank_size = int(os.getenv('RANK_SIZE')) +rank_id = int(os.getenv('RANK_ID')) + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. 
" + "This specifies the model architecture.") + +flags.DEFINE_string("task_name", None, "The name of the task to train.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +flags.DEFINE_bool('npu_gather', False, 'Whether to use gather_npu whose backward propagation avoids IndexedSlices') + +flags.DEFINE_bool('npu_bert_debug', False, 'If True, dropout and shuffle is disabled.') + +flags.DEFINE_bool('npu_bert_npu_dropout', True, 'Whether to use npu defined dropout op') + +flags.DEFINE_bool('npu_bert_fused_gelu', True, 'Whether to use npu defined gelu op') + +flags.DEFINE_bool('use_fast_gelu', False, 'use fast gelu instead gelu') + +flags.DEFINE_bool("use_fp16_cls", False, "Whether to use fp16 in cls and pooler.") + +flags.DEFINE_integer('init_loss_scale_value', 2**32, 'Initial loss scale value for loss scale optimizer') + +flags.DEFINE_bool('npu_bert_tail_optimize', False, 'Whether to use npu allreduce tail optimization') + +flags.DEFINE_bool("npu_bert_clip_by_global_norm", True, "Use clip_by_global_norm if True, or use clip_by_norm for each gradient") + +flags.DEFINE_bool('npu_bert_use_fused_adam_momentum', False, 'Whether to use fused apply and assign in adam') + +flags.DEFINE_bool('npu_bert_use_fused_lamb_momentum', False, 'Whether to use fused apply and assign in lamb') + +flags.DEFINE_bool('npu_bert_npu_dropout_v3', False, 'Whether to use npu defined dropout_v3 op') +## Other parameters +flags.DEFINE_string( + "dllog_path", "bert_dllog.json", + "filename where dllogger writes to") + +flags.DEFINE_string( + "optimizer_type", "lamb", + "Optimizer type : adam or lamb") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. 
Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool( + "do_predict", False, + "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("npu_bert_loss_scale", 0, "Whether to use loss scale.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_bool("use_trt", False, "Whether to use TF-TRT") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_bool("distributed", False, "Use multiple NPUs for training.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") +flags.DEFINE_integer("display_loss_steps", 10, + "How often to print loss from estimator") + +flags.DEFINE_integer("iterations_per_loop", 1, + "How many steps to make in each estimator call.") +flags.DEFINE_integer("num_accumulation_steps", 1, + "Number of accumulation steps before gradient update" + "Global batch size = num_accumulation_steps * train_batch_size") +flags.DEFINE_bool("amp", True, "Whether to enable AMP ops. 
When false, uses TF32 on A100 and FP32 on V100 GPUS.") +flags.DEFINE_bool("use_xla", True, "Whether to enable XLA JIT compilation.") +flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs") + +flags.DEFINE_bool( + "verbose_logging", False, + "If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.") + +#npu parameter +flags.DEFINE_string("precision_mode", "allow_mix_precision", "Npu Precision Mode") +flags.DEFINE_bool( + "enable_exception_dump", False, + "Whether to enable excepttion dump.") + +# NOTE(review): the dump/overflow flags below were previously also defined in +# this section; the duplicate DEFINE_* calls raised absl DuplicateFlagError at +# import time, so only the NPU_modify definitions are kept. +##################NPU_modify start############################# +flags.DEFINE_bool("over_dump", False, "Whether check overflow") +flags.DEFINE_bool("data_dump_flag", False, "whether dump data") +flags.DEFINE_string("data_dump_step", "0", "Only used if `data_dump_flag` is True") +flags.DEFINE_string("over_dump_path", "test/output/overflow_dump", "Only used if `over_dump` is True") +flags.DEFINE_string("data_dump_path", "test/output/data_dump", "Only used if `data_dump_flag` is True") +flags.DEFINE_bool("autotune", False, "Whether autotune") +flags.DEFINE_bool("profiling", False, "Whether profiling") +flags.DEFINE_string("profiling_dump_path", "test/output/profiling_path", "Only used if `profiling` is True") + +def file_based_input_fn_builder(input_file, batch_size, seq_length, is_training, + drop_remainder, hvd=None): + """Creates an `input_fn` closure to be passed to Estimator.""" + + name_to_features = { + "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64), + 
"input_mask": tf.io.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.io.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.io.FixedLenFeature([], tf.int64), + } + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(): + """The actual input function.""" + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if is_training: + if FLAGS.distributed: + d = d.shard(rank_size, rank_id) + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=True)) + + return d + + return input_fn + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + labels, num_labels, use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings, + compute_type=tf.float32) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use model.get_sequence_output() + # instead. 
+ output_layer = model.get_pooled_output() + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = npu_ops.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias, name='cls_logits') + probabilities = tf.nn.softmax(logits, axis=-1, name='cls_probabilities') + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1, name='cls_per_example_loss') + loss = tf.reduce_mean(per_example_loss, name='cls_loss') + + return (loss, per_example_loss, logits, probabilities) + +def get_frozen_tftrt_model(bert_config, shape, num_labels, use_one_hot_embeddings, init_checkpoint): + tf_config = tf.compat.v1.ConfigProto() + tf_config.gpu_options.allow_growth = True + output_node_names = ['loss/cls_loss', 'loss/cls_per_example_loss', 'loss/cls_logits', 'loss/cls_probabilities'] + + with tf.Session(config=npu_config_proto(config_proto=tf_config)) as tf_sess: + input_ids = tf.placeholder(tf.int32, shape, 'input_ids') + input_mask = tf.placeholder(tf.int32, shape, 'input_mask') + segment_ids = tf.placeholder(tf.int32, shape, 'segment_ids') + label_ids = tf.placeholder(tf.int32, (None), 'label_ids') + + create_model(bert_config, False, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + tf.train.init_from_checkpoint(init_checkpoint, 
assignment_map) + tf_sess.run(tf.global_variables_initializer()) + print("LOADED!") + tf.compat.v1.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + else: + init_string = ", *NOTTTTTTTTTTTTTTTTTTTTT" + tf.compat.v1.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) + + frozen_graph = tf.graph_util.convert_variables_to_constants(tf_sess, + tf_sess.graph.as_graph_def(), output_node_names) + + num_nodes = len(frozen_graph.node) + print('Converting graph using TensorFlow-TensorRT...') + from tensorflow.python.compiler.tensorrt import trt_convert as trt + converter = trt.TrtGraphConverter( + input_graph_def=frozen_graph, + nodes_blacklist=output_node_names, + max_workspace_size_bytes=(4096 << 20) - 1000, + precision_mode = "FP16" if FLAGS.amp else "FP32", + minimum_segment_size=4, + is_dynamic_op=True, + maximum_cached_engines=1000 + ) + frozen_graph = converter.convert() + + print('Total node count before and after TF-TRT conversion:', + num_nodes, '->', len(frozen_graph.node)) + print('TRT node count:', + len([1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp'])) + + with tf.io.gfile.GFile("frozen_modelTRT.pb", "wb") as f: + f.write(frozen_graph.SerializeToString()) + + return frozen_graph + + + +def model_fn_builder(task_name, bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, + use_one_hot_embeddings, hvd=None): + """Returns `model_fn` closure for Estimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for Estimator.""" + + def metric_fn(per_example_loss, label_ids, logits): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + if task_name == "cola": + FN, FN_op = tf.metrics.false_negatives(labels=label_ids, predictions=predictions) + FP, FP_op = tf.metrics.false_positives(labels=label_ids, 
predictions=predictions) + TP, TP_op = tf.metrics.true_positives(labels=label_ids, predictions=predictions) + TN, TN_op = tf.metrics.true_negatives(labels=label_ids, predictions=predictions) + + MCC = (TP * TN - FP * FN) / ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) ** 0.5 + MCC_op = tf.group(FN_op, TN_op, TP_op, FP_op, tf.identity(MCC, name="MCC")) + return {"MCC": (MCC, MCC_op)} + elif task_name == "mrpc": + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions) + loss = tf.metrics.mean(values=per_example_loss) + f1 = tf_metrics.f1(labels=label_ids, predictions=predictions, num_classes=2, pos_indices=[1]) + return { + "eval_accuracy": accuracy, + "eval_f1": f1, + "eval_loss": loss, + } + else: + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions) + loss = tf.metrics.mean(values=per_example_loss) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + tf.compat.v1.logging.info("*** Features ***") + tf.compat.v1.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.compat.v1.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + if not is_training and FLAGS.use_trt: + trt_graph = get_frozen_tftrt_model(bert_config, input_ids.shape, num_labels, use_one_hot_embeddings, init_checkpoint) + (total_loss, per_example_loss, logits, probabilities) = tf.import_graph_def(trt_graph, + input_map={'input_ids':input_ids, 'input_mask':input_mask, 'segment_ids':segment_ids, 'label_ids':label_ids}, + return_elements=['loss/cls_loss:0', 'loss/cls_per_example_loss:0', 'loss/cls_logits:0', 'loss/cls_probabilities:0'], + name='') + if mode == tf.estimator.ModeKeys.PREDICT: + predictions = {"probabilities": probabilities} + output_spec = tf.estimator.EstimatorSpec( + mode=mode, 
predictions=predictions) + elif mode == tf.estimator.ModeKeys.EVAL: + eval_metric_ops = metric_fn(per_example_loss, label_ids, logits) + output_spec = tf.estimator.EstimatorSpec( + mode=mode, + loss=total_loss, + eval_metric_ops=eval_metric_ops) + return output_spec + (total_loss, per_example_loss, logits, probabilities) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + if init_checkpoint and (hvd is None or get_npu_rank_id() == 0): + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + if FLAGS.verbose_logging: + tf.compat.v1.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.compat.v1.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, + hvd, False, FLAGS.amp, FLAGS.num_accumulation_steps, FLAGS.optimizer_type) + output_spec = tf.estimator.EstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op) + elif mode == tf.estimator.ModeKeys.EVAL: + dummy_op = tf.no_op() + # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite + if FLAGS.amp: + loss_scaler = FixedLossScaleManager(1) + dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite( + optimization.LAMBOptimizer(learning_rate=0.0), loss_scaler) + eval_metric_ops = metric_fn(per_example_loss, label_ids, logits) + output_spec = tf.estimator.EstimatorSpec( + mode=mode, + loss=total_loss, + eval_metric_ops=eval_metric_ops) + else: + dummy_op = tf.no_op() + # Need to call mixed 
precision graph rewrite if fp16 to enable graph rewrite + if FLAGS.amp: + dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite( + optimization.LAMBOptimizer(learning_rate=0.0)) + output_spec = tf.estimator.EstimatorSpec( + mode=mode, predictions=probabilities) + return output_spec + + return model_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def input_fn_builder(features, batch_size, seq_length, is_training, drop_remainder, hvd=None): + """Creates an `input_fn` closure to be passed to Estimator.""" + + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] + + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) + + def input_fn(): + """The actual input function.""" + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. 
+ d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) + + if is_training: + if hvd is not None: d = d.shard(get_rank_size(), get_rank_id()) + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.batch(batch_size=batch_size, drop_remainder=True) + return d + + return input_fn + + +def main(_): + + setup_xla_flags() + + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) + dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path) + + if FLAGS.horovod: + print() + + processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mrpc": MrpcProcessor, + "xnli": XnliProcessor, + } + + if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: + raise ValueError( + "At least one of `do_train`, `do_eval` or `do_predict' must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + + tf.io.gfile.makedirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + master_process = True + training_hooks = [] + global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps + hvd_rank = 0 
+ + config_proto = tf.compat.v1.ConfigProto() + + if FLAGS.distributed: + tf.compat.v1.logging.info("Multi-NPU training with Npu") + tf.compat.v1.logging.info("rank.size() = %d rank.id() = %d", rank_size, rank_id) + global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps * rank_size + master_process = (rank_id == 0) + hvd_rank = rank_id + config_proto.gpu_options.visible_device_list = str(rank_id) + # set_affinity(get_npu_local_rank_id()) + #if get_npu_rank_size() > 1: + # training_hooks.append(NpuEmptyHook()) + + if FLAGS.use_xla: + config_proto.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1 + if FLAGS.amp: + tf.enable_resource_variables() + + auto_tune_mode = None + profiling_config = ProfilingConfig(enable_profiling=False, profiling_options=None) + dump_config = DumpConfig(enable_dump=False, dump_path=None, dump_step=None, dump_mode="output", + enable_dump_debug=False, dump_debug_mode="all") + + if FLAGS.autotune: + auto_tune_mode = "RL,GA" + if FLAGS.profiling: + profiling_config = ProfilingConfig( + enable_profiling = True, + profiling_options = '{"output":"%s",\ + "training_trace":"on",\ + "task_trace":"on",\ + "aicpu":"on",\ + "aic_metrics":"PipeUtilization"}' % FLAGS.profiling_dump_path + ) + + if FLAGS.over_dump: + dump_config = DumpConfig(dump_path=FLAGS.over_dump_path, enable_dump_debug=True, dump_debug_mode="all") + if FLAGS.data_dump_flag: + dump_config = DumpConfig(enable_dump=True, dump_path=FLAGS.data_dump_path, dump_step=FLAGS.data_dump_step, dump_mode="all") + + run_config = NPURunConfig( + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else 0, + iterations_per_loop=FLAGS.iterations_per_loop, + session_config=config_proto, + hcom_parallel=True, + precision_mode=FLAGS.precision_mode, + keep_checkpoint_max=5, + log_step_count_steps=10, + auto_tune_mode=auto_tune_mode, + profiling_config=profiling_config, + dump_config=dump_config + ) + + if 
master_process: + tf.compat.v1.logging.info("***** Configuration *****") + for key in FLAGS.__flags.keys(): + tf.compat.v1.logging.info(' {}: {}'.format(key, getattr(FLAGS, key))) + tf.compat.v1.logging.info("**************************") + + train_examples = None + num_train_steps = None + num_warmup_steps = None + training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank, FLAGS.save_checkpoints_steps, num_steps_ignore_xla=25)) + + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + + num_train_steps = int( + len(train_examples) / global_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + start_index = 0 + end_index = len(train_examples) + tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")] + + if FLAGS.distributed: + tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(rank_size)] + num_examples_per_rank = len(train_examples) // rank_size + remainder = len(train_examples) % rank_size + if rank_id < remainder: + start_index = rank_id * (num_examples_per_rank + 1) + end_index = start_index + num_examples_per_rank + 1 + else: + start_index = rank_id * num_examples_per_rank + remainder + end_index = start_index + (num_examples_per_rank) + + # model_fn_builder() takes no `distributed` parameter; it reads + # FLAGS.distributed directly, so no such kwarg is passed here. + model_fn = model_fn_builder( + task_name=task_name, + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate if not FLAGS.distributed else FLAGS.learning_rate * rank_size, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_one_hot_embeddings=False, + hvd=None if not FLAGS.horovod else hvd) + + + estimator = NPUEstimator( + model_fn=model_fn, + config=run_config) + + if FLAGS.do_train: + file_based_convert_examples_to_features( + train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank]) + + 
tf.compat.v1.logging.info("***** Running training *****") + tf.compat.v1.logging.info(" Num examples = %d", len(train_examples)) + tf.compat.v1.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.compat.v1.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=tmp_filenames, + batch_size=FLAGS.train_batch_size, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True, + hvd=None if not FLAGS.horovod else hvd) + + train_start_time = time.time() + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=npu_hooks_append(hooks_list=training_hooks)) + train_time_elapsed = time.time() - train_start_time + #train_time_wo_overhead = training_hooks[-2].total_time + avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed + #ss_sentences_per_second = (training_hooks[-2].count - training_hooks[-2].skipped) * global_batch_size * 1.0 / train_time_wo_overhead + + if master_process: + tf.compat.v1.logging.info("-----------------------------") + tf.compat.v1.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed, + num_train_steps * global_batch_size) + #tf.compat.v1.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead, + # (training_hooks[-2].count - training_hooks[-2].skipped) * global_batch_size) + tf.compat.v1.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second) + #tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) + tf.compat.v1.logging.info("-----------------------------") + + if FLAGS.do_eval and master_process: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + + 
tf.compat.v1.logging.info("***** Running evaluation *****") + tf.compat.v1.logging.info(" Num examples = %d", len(eval_examples)) + tf.compat.v1.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + eval_drop_remainder = False + eval_input_fn = file_based_input_fn_builder( + input_file=eval_file, + batch_size=FLAGS.eval_batch_size, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) + + eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)] + eval_start_time = time.time() + result = estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks) + + eval_time_elapsed = time.time() - eval_start_time + + time_list = eval_hooks[-1].time_list + time_list.sort() + # Removing outliers (init/warmup) in throughput computation. + eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.8)]) + num_sentences = (int(len(time_list) * 0.8)) * FLAGS.eval_batch_size + + avg = np.mean(time_list) + cf_50 = max(time_list[:int(len(time_list) * 0.50)]) + cf_90 = max(time_list[:int(len(time_list) * 0.90)]) + cf_95 = max(time_list[:int(len(time_list) * 0.95)]) + cf_99 = max(time_list[:int(len(time_list) * 0.99)]) + cf_100 = max(time_list[:int(len(time_list) * 1)]) + ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead + + tf.compat.v1.logging.info("-----------------------------") + tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed, + eval_hooks[-1].count * FLAGS.eval_batch_size) + tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead, + num_sentences) + tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set") + tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size) + tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length) + tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32") + tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 
1000) + tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000) + tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000) + tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000) + tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000) + tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000) + tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) + dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT) + tf.compat.v1.logging.info("-----------------------------") + + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.io.gfile.GFile(output_eval_file, "w") as writer: + tf.compat.v1.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + dllogging.logger.log(step=(), data={key: float(result[key])}, verbosity=Verbosity.DEFAULT) + tf.compat.v1.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + if FLAGS.do_predict and master_process: + predict_examples = processor.get_test_examples(FLAGS.data_dir) + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + file_based_convert_examples_to_features(predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file) + + tf.compat.v1.logging.info("***** Running prediction*****") + tf.compat.v1.logging.info(" Num examples = %d", len(predict_examples)) + tf.compat.v1.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_drop_remainder = False + predict_input_fn = file_based_input_fn_builder( + input_file=predict_file, + batch_size=FLAGS.predict_batch_size, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=predict_drop_remainder) + + predict_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)] + predict_start_time = time.time() + + 
output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") + with tf.io.gfile.GFile(output_predict_file, "w") as writer: + tf.compat.v1.logging.info("***** Predict results *****") + for prediction in estimator.predict(input_fn=predict_input_fn, hooks=predict_hooks, + yield_single_examples=False): + output_line = "\t".join( + str(class_probability) for class_probability in prediction) + "\n" + writer.write(output_line) + + + predict_time_elapsed = time.time() - predict_start_time + + time_list = predict_hooks[-1].time_list + time_list.sort() + # Removing outliers (init/warmup) in throughput computation. + predict_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.8)]) + num_sentences = (int(len(time_list) * 0.8)) * FLAGS.predict_batch_size + + avg = np.mean(time_list) + cf_50 = max(time_list[:int(len(time_list) * 0.50)]) + cf_90 = max(time_list[:int(len(time_list) * 0.90)]) + cf_95 = max(time_list[:int(len(time_list) * 0.95)]) + cf_99 = max(time_list[:int(len(time_list) * 0.99)]) + cf_100 = max(time_list[:int(len(time_list) * 1)]) + ss_sentences_per_second = num_sentences * 1.0 / predict_time_wo_overhead + + tf.compat.v1.logging.info("-----------------------------") + tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d", predict_time_elapsed, + predict_hooks[-1].count * FLAGS.predict_batch_size) + tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", predict_time_wo_overhead, + num_sentences) + tf.compat.v1.logging.info("Summary Inference Statistics on TEST SET") + tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size) + tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length) + tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32") + tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000) + tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000) + 
tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000) + tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000) + tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000) + tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000) + tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) + dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT) + tf.compat.v1.logging.info("-----------------------------") + + +if __name__ == "__main__": + if FLAGS.horovod: + (npu_sess, npu_shutdown) = init_resource() + flags.mark_flag_as_required("data_dir") + flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.compat.v1.app.run() + if FLAGS.horovod: + shutdown_resource(npu_sess, npu_shutdown) + close_session(npu_sess) + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/utils/utils.py b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/utils/utils.py index ef32a31cfd754ed93e563291d149822e281aa2ab..898b6706958781aecdd532f461925d03128ec82c 100644 --- a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/utils/utils.py +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/utils/utils.py @@ -79,7 +79,7 @@ class LogTrainRunHook(tf.estimator.SessionRunHook): def before_run(self, run_context): self.t0 = time.time() return tf.estimator.SessionRunArgs( - fetches=['step_update:0']) + fetches=['global_step:0']) def after_run(self, run_context, run_values): elapsed_secs = time.time() - self.t0 diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1641_BertLarge-128_full_1p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1641_BertLarge-128_full_1p.sh new file mode 100644 index 
0000000000000000000000000000000000000000..013ec5c4f76ac6d9b229fa80a86b8ad92e36ec48 --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1641_BertLarge-128_full_1p.sh @@ -0,0 +1,240 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge-128_ID1641_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=3.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MRPC +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" 
+ use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` 
+ profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-24_H-1024_A-16 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + 
--data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type=$optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' 
$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1641_BertLarge-128_full_8p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1641_BertLarge-128_full_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..616ab8c4e90033c598450b4315f0d7aa45c1f2f4 --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1641_BertLarge-128_full_8p.sh @@ -0,0 +1,249 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 +export RANK_TABLE_FILE=$cur_path/../configs/${RANK_SIZE}p.json +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge-128_ID1641_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=3.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 
+precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MRPC +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" + use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == 
--optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-24_H-1024_A-16 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p 
${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l` + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 + let c=b*${corenum}/8-1 + if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" + fi + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + --data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type=$optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` 
+#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1641_BertLarge-128_performance_1p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1641_BertLarge-128_performance_1p.sh new file mode 100644 index 
0000000000000000000000000000000000000000..d62376f98ed59e2acb3ea1c9a673c69e5c417c28 --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1641_BertLarge-128_performance_1p.sh @@ -0,0 +1,239 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge-128_ID1641_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=1.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MRPC +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" 
+ use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` 
+ profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-24_H-1024_A-16 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + 
--data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type=$optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' 
$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1641_BertLarge-128_performance_8p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1641_BertLarge-128_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..cd82237c5bedb518fb8b6bbb9289563cb8e6c438 --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1641_BertLarge-128_performance_8p.sh @@ -0,0 +1,249 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 +export RANK_TABLE_FILE=$cur_path/../configs/${RANK_SIZE}p.json +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge-128_ID1641_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=1.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False 
+over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MRPC +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" + use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* 
]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-24_H-1024_A-16 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 
绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l` + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 + let c=b*${corenum}/8-1 + if [ "x${bind_core" != x ];then + bind_core="taskset -c $a-$c" + fi + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + --data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type= $optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + 
+#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1642_BertLarge-128_full_1p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1642_BertLarge-128_full_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..821fc133592fc46fa31adf2e9c9503ee4231227c --- /dev/null +++ 
b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1642_BertLarge-128_full_1p.sh @@ -0,0 +1,240 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge-128_ID1642_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=3.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MNLI +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" 
+ use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` 
+ profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-24_H-1024_A-16 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + 
--data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type= $optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' 
$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1642_BertLarge-128_full_8p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1642_BertLarge-128_full_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..c08598833696d70713bd18457b2e9260ee9c1ef4 --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1642_BertLarge-128_full_8p.sh @@ -0,0 +1,249 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 +export RANK_TABLE_FILE=#cur_path/../configs/${RANK_SIZE}p.json +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge-128_ID1642_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=3.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 
precision_mode="allow_mix_precision"
optimizer_type="adam"
# Maintenance/debug parameters; no change needed below.
over_dump=False
over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump
data_dump_flag=False
data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump
enable_exception_dump=False
data_dump_step="0"
profiling=False
autotune=False

# Other parameters.
task_name=MNLI
output_dir=ckpt
type=official
use_xla=false
use_fp16=""
if [ "$precision" = "fp16" ] ; then
  echo "fp16 activated!"
  use_fp16="--amp"
else
  echo "fp32/tf32 activated!"
  use_fp16="--noamp"
fi


if [ "$use_xla" = "true" ] ; then
  use_xla_tag="--use_xla"
  echo "XLA activated"
else
  use_xla_tag="--nouse_xla"
fi

# Help message.
if [[ $1 == --help || $1 == -h ]];then
  # Fix: `echo"usage..."` had no space, so bash looked for a command literally
  # named `echo"usage..."` -> "command not found". Also print this script's
  # actual name rather than train_performance_1P.sh.
  echo "usage:./train_ID1642_BertLarge-128_full_8p.sh "
  echo " "
  echo "parameter explain:
  --precision_mode           precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
  --over_dump                if or not over detection, default is False
  --data_dump_flag           data dump flag, default is False
  --data_dump_step           data dump step, default is 10
  --profiling                if or not profiling for performance debug, default is False
  --data_path                source data of training
  -h/--help                  show help message
  "
  exit 1
fi

# Command-line argument parsing (--key=value form).
for para in $*
do
  if [[ $para == --task_name* ]];then
    task_name=`echo ${para#*=}`
  elif [[ $para == --data_path* ]];then
    data_path=`echo ${para#*=}`
  elif [[ $para == --bind_core* ]];then
    bind_core=`echo ${para#*=}`
    name_bind="_bindcore"
  elif [[ $para == --ckpt_path* ]];then
    ckpt_path=`echo ${para#*=}`
  elif [[ $para == --train_batch_size* ]];then
    train_batch_size=`echo ${para#*=}`
  elif [[ $para == --learning_rate* ]];then
    learning_rate=`echo ${para#*=}`
  elif [[ $para == --num_train_epochs* ]];then
    num_train_epochs=`echo ${para#*=}`
  elif [[ $para == --output_dir* ]];then
    output_dir=`echo ${para#*=}`
  elif [[ $para == --precision_mode* ]];then
    precision_mode=`echo ${para#*=}`
  elif [[ $para == --optimizer_type* ]];then
    optimizer_type=`echo ${para#*=}`
  elif [[ $para == --over_dump* ]];then
    over_dump=`echo ${para#*=}`
  elif [[ $para == --data_dump_flag* ]];then
    data_dump_flag=`echo ${para#*=}`
  elif [[ $para == --data_dump_path* ]];then
    data_dump_path=`echo ${para#*=}`
  elif [[ $para == --data_dump_step* ]];then
    data_dump_step=`echo ${para#*=}`
  elif [[ $para == --enable_exception_dump* ]];then
    enable_exception_dump=`echo ${para#*=}`
  elif [[ $para == --profiling* ]];then
    profiling=`echo ${para#*=}`
    profiling_dump_path=${cur_path}/output/profiling
    mkdir -p ${profiling_dump_path}
  elif [[ $para == --autotune* ]];then
    autotune=`echo ${para#*=}`
    # NOTE(review): the two `mv` lines rename .../custom to custom_bak, yet the
    # `cp` lines below still read from .../custom — the source presumably no
    # longer exists at that point; verify the intended source (custom_bak?).
    mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak
    mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak
    autotune_dump_path=${cur_path}/output/autotune_dump
    mkdir -p ${autotune_dump_path}/GA
    # Fix: original created lowercase "rl" but copied into "RL/" below, so the
    # copy target never existed.
    mkdir -p ${autotune_dump_path}/RL
    cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/
    cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/
  fi
done

# data_path is mandatory.
if [[ $data_path == "" ]];then
  echo "[Error] para \"data_path\" must be config"
  exit 1
fi
bertmodelpath=$ckpt_path/uncased_L-24_H-1024_A-16
# Training start time.
start_time=$(date +%s)

# Enter the model directory (one level above test/).
cd $cur_path/../
for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
do
  # Per-rank environment.
  echo "Device ID: $ASCEND_DEVICE_ID"
  export RANK_ID=$RANK_ID
  export ASCEND_DEVICE_ID=$RANK_ID
  ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID}

  # Recreate the per-device output directories.
  if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
    rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
    mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
    mkdir -p ${data_dump_path}
    mkdir -p ${over_dump_path}
  else
    mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
    mkdir -p ${data_dump_path}
    mkdir -p ${over_dump_path}
  fi

  # Core binding (optional). The first three `let` values are immediately
  # overwritten from the real core count; kept to preserve original structure.
  let a=RANK_ID*12
  let b=RANK_ID+1
  let c=b*12-1

  corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l`
  let a=RANK_ID*${corenum}/8
  let b=RANK_ID+1
  let c=b*${corenum}/8-1
  # Fix: original read `"x${bind_core"` — missing closing brace is a bash
  # "bad substitution" error.
  if [ "x${bind_core}" != x ];then
    bind_core="taskset -c $a-$c"
  fi

  # Launch fine-tuning in the background; one process per rank, log per device.
  nohup ${bind_core} python3 ./src/run_classifier.py \
    --task_name=$task_name \
    --do_train=true \
    --do_eval=true \
    --enable_exception_dump=$enable_exception_dump \
    --data_dump_flag=$data_dump_flag \
    --data_dump_step=$data_dump_step \
    --data_dump_path=$data_dump_path \
    --over_dump=$over_dump \
    --over_dump_path=$over_dump_path \
    --precision_mode=$precision_mode \
    --data_dir=${data_path}/Glue/${task_name} \
    --vocab_file=$bertmodelpath/vocab.txt \
    --bert_config_file=$bertmodelpath/bert_config.json \
    --init_checkpoint=$bertmodelpath/bert_model.ckpt \
    --max_seq_length=128 \
    --train_batch_size=$train_batch_size \
    --learning_rate=$learning_rate \
    --num_train_epochs=$num_train_epochs \
    --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \
    --horovod=false "$use_fp16" \
    --distributed=False \
    --npu_bert_loss_scale=0 \
    --optimizer_type=$optimizer_type \
    $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
done
wait
# Training end time.
end_time=$(date +%s)
e2e_time=$(( $end_time - $start_time ))

# Result reporting.
echo "------------------ Final result ------------------"
# Throughput: last reported global_step/sec times the batch size.
step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'`
FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'`
echo "Final Performance images/sec : $FPS"
# Fix: eval_results.txt is written under the --output_dir passed above
# (output/$ASCEND_DEVICE_ID/${output_dir}), not directly under ${cur_path}/${output_dir}.
train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir}/eval_results.txt|awk '{print $3}'`
+#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1642_BertLarge-128_performance_1p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1642_BertLarge-128_performance_1p.sh new file mode 100644 index 
0000000000000000000000000000000000000000..73235508dac576151810b8805dcae86cef97468d --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1642_BertLarge-128_performance_1p.sh @@ -0,0 +1,239 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge-128_ID1642_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=1.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MNLI +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" 
+ use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` 
+ profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-24_H-1024_A-16 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + 
--data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type= $optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' 
$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1642_BertLarge-128_performance_8p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1642_BertLarge-128_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..784348c1cd18fb54f241cd32e992a0acc89991d1 --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1642_BertLarge-128_performance_8p.sh @@ -0,0 +1,249 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 +export RANK_TABLE_FILE=#cur_path/../configs/${RANK_SIZE}p.json +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge-128_ID1642_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=1.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False 
+over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MNLI +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" + use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* 
]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-24_H-1024_A-16 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 
绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l` + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 + let c=b*${corenum}/8-1 + # BUGFIX: `${bind_core` was missing its closing brace -> "bad substitution" at runtime + if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" + fi + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + --data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type=$optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" +
+#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1643_BertLarge-128_full_1p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1643_BertLarge-128_full_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..d71d92c6134cecc69bad308be2746090a3b1a560 --- /dev/null +++ 
b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1643_BertLarge-128_full_1p.sh @@ -0,0 +1,240 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge-128_ID1643_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=3.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=CoLA +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" 
+ use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` 
+ profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-24_H-1024_A-16 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + 
--data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type=$optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}'
$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1643_BertLarge-128_full_8p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1643_BertLarge-128_full_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..dd001a79243ac61641dc18d4340554c25ab46f65 --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1643_BertLarge-128_full_8p.sh @@ -0,0 +1,249 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 +# BUGFIX: was `=#cur_path/...` — `#` starts a shell comment, leaving RANK_TABLE_FILE empty +export RANK_TABLE_FILE=$cur_path/../configs/${RANK_SIZE}p.json +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge-128_ID1643_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=3.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改
+precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=CoLA +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" + use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == 
--optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-24_H-1024_A-16 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p 
${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l` + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 + let c=b*${corenum}/8-1 + # BUGFIX: `${bind_core` was missing its closing brace -> "bad substitution" at runtime + if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" + fi + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + --data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type=$optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'`
+#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1643_BertLarge-128_performance_1p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1643_BertLarge-128_performance_1p.sh new file mode 100644 index 
0000000000000000000000000000000000000000..710a426ba66f591223a8405ad3cd4a0ba2ebd22a --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1643_BertLarge-128_performance_1p.sh @@ -0,0 +1,239 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge-128_ID1643_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=1.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=CoLA +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" 
+ use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` 
+ profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-24_H-1024_A-16 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + 
--data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type=$optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}'
$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1643_BertLarge-128_performance_8p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1643_BertLarge-128_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..9303a7c4a00c28aee8a4b7a1ad889b6ef37ce99c --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID1643_BertLarge-128_performance_8p.sh @@ -0,0 +1,249 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 +# BUGFIX: was `=#cur_path/...` — `#` starts a shell comment, leaving RANK_TABLE_FILE empty +export RANK_TABLE_FILE=$cur_path/../configs/${RANK_SIZE}p.json +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertLarge-128_ID1643_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=1.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False
+over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=CoLA +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" + use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* 
]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-24_H-1024_A-16 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 
绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l` + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 + let c=b*${corenum}/8-1 + if [ "x${bind_core" != x ];then + bind_core="taskset -c $a-$c" + fi + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + --data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type= $optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + 
+#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3232_BertBase-128_full_1p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3232_BertBase-128_full_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..69dc1aba451917ea3ae843de9d3484aff1c64bae --- /dev/null +++ 
b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3232_BertBase-128_full_1p.sh @@ -0,0 +1,240 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertBase-128_ID3232_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=3.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MRPC +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" 
+    use_fp16="--noamp"
+fi
+
+
+if [ "$use_xla" = "true" ] ; then
+    use_xla_tag="--use_xla"
+    echo "XLA activated"
+else
+    use_xla_tag="--nouse_xla"
+fi
+
+# Help message (no need to modify)
+if [[ $1 == --help || $1 == -h ]];then
+    # Fix: original had `echo"usage:...` (no space), which tries to run a
+    # nonexistent command instead of printing the usage line.
+    echo "usage:./train_performance_1P.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+# Parse command-line overrides (no need to modify)
+for para in $*
+do
+    if [[ $para == --task_name* ]];then
+        task_name=`echo ${para#*=}`
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --bind_core* ]];then
+        bind_core=`echo ${para#*=}`
+        name_bind="_bindcore"
+    elif [[ $para == --ckpt_path* ]];then
+        ckpt_path=`echo ${para#*=}`
+    elif [[ $para == --train_batch_size* ]];then
+        train_batch_size=`echo ${para#*=}`
+    elif [[ $para == --learning_rate* ]];then
+        learning_rate=`echo ${para#*=}`
+    elif [[ $para == --num_train_epochs* ]];then
+        num_train_epochs=`echo ${para#*=}`
+    elif [[ $para == --output_dir* ]];then
+        output_dir=`echo ${para#*=}`
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --optimizer_type* ]];then
+        optimizer_type=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+    elif [[ $para == --data_dump_path* ]];then
+        data_dump_path=`echo ${para#*=}`
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --enable_exception_dump* ]];then
+        enable_exception_dump=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+ profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-12_H-768_A-12 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + 
--data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type= $optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' 
$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3232_BertBase-128_full_8p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3232_BertBase-128_full_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..b309a003ebc374f89e28866b0781a4567f600aca --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3232_BertBase-128_full_8p.sh @@ -0,0 +1,250 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 +export RANK_TABLE_FILE=#cur_path/../configs/${RANK_SIZE}p.json +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertBase-128_ID3232_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=3.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 
+precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MRPC +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" + use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == 
--optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-12_H-768_A-12 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p 
${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l` + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 + let c=b*${corenum}/8-1 + if [ "x${bind_core" != x ];then + bind_core="taskset -c $a-$c" + fi + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + --data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type= $optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` 
+#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3232_BertBase-128_performance_1p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3232_BertBase-128_performance_1p.sh new file mode 100644 index 
0000000000000000000000000000000000000000..70ee468f6e9e680aa568764740ff0ea370e381d2 --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3232_BertBase-128_performance_1p.sh @@ -0,0 +1,239 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertBase-128_ID3232_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=1.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MRPC +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" 
+ use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` 
+ profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-12_H-768_A-12 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + 
--data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type= $optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' 
$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3232_BertBase-128_performance_8p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3232_BertBase-128_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..b2bb34a7af5e754357c2c37d5db341366c87412c --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3232_BertBase-128_performance_8p.sh @@ -0,0 +1,250 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 +export RANK_TABLE_FILE=#cur_path/../configs/${RANK_SIZE}p.json +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertBase-128_ID3232_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=1.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False 
+over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MRPC +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" + use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* 
]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-12_H-768_A-12 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 
绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l` + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 + let c=b*${corenum}/8-1 + if [ "x${bind_core" != x ];then + bind_core="taskset -c $a-$c" + fi + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + --data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type= $optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + 
+#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3233_BertBase-128_full_1p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3233_BertBase-128_full_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..aed8fc1c25efb480b090e710e75df24d82c53bd9 --- /dev/null +++ 
b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3233_BertBase-128_full_1p.sh @@ -0,0 +1,240 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertBase-128_ID3233_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=3.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MNLI +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" 
+ use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` 
+ profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-12_H-768_A-12 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + 
--data_dir=${data_path}/Glue/${task_name} \
+        --vocab_file=$bertmodelpath/vocab.txt \
+        --bert_config_file=$bertmodelpath/bert_config.json \
+        --init_checkpoint=$bertmodelpath/bert_model.ckpt \
+        --max_seq_length=128 \
+        --train_batch_size=$train_batch_size \
+        --learning_rate=$learning_rate \
+        --num_train_epochs=$num_train_epochs \
+        --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \
+        --horovod=false "$use_fp16" \
+        --distributed=False \
+        --npu_bert_loss_scale=0 \
+        --optimizer_type=$optimizer_type \
+        $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+#训练结束时间,不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#结果打印,不需要修改
+echo "------------------ Final result ------------------"
+#输出性能FPS,需要模型审视修改
+step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'`
+FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'`
+#打印,不需要修改
+echo "Final Performance images/sec : $FPS"
+#输出训练精度,需要模型审视修改
+train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'`
+#打印,不需要修改
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#性能看护结果汇总
+#训练用例信息,不需要修改
+BatchSize=${train_batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+##获取性能数据,不需要修改
+#吞吐量
+ActualFPS=${FPS}
+#单迭代训练时长,不需要修改
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'`
+
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视
+grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+#最后一个迭代loss值,不需要修改
+ActualLoss=`awk 'END {print}'
$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+#关键信息打印到${CaseName}.log中,不需要修改
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+
+
+
+
+
diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3233_BertBase-128_full_8p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3233_BertBase-128_full_8p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b7d9d29d9964c8b86b06a8cfd62cb4538553e3dd
--- /dev/null
+++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3233_BertBase-128_full_8p.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+#当前路径,不需要修改
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+export RANK_SIZE=8
+export RANK_TABLE_FILE=${cur_path}/../configs/${RANK_SIZE}p.json
+export JOB_ID=10087
+export GE_USE_STATIC_MEMORY=1
+export HCCL_CONNECT_TIMEOUT=600
+RANK_ID_START=0
+
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#基础参数,需要模型审视修改
+#网络名称,同目录名称
+Network="BertBase-128_ID3233_for_Tensorflow"
+#训练batch_size
+train_batch_size=32
+#训练ephch
+num_train_epochs=3.0
+#学习率
+learning_rate=1e-6
+warmup_proportion=0.1
+precision="fp32"
+#维测参数,precision_mode需要模型审视修改
+precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MNLI +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" + use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == 
--optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-12_H-768_A-12 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p 
${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l` + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 + let c=b*${corenum}/8-1 + if [ "x${bind_core" != x ];then + bind_core="taskset -c $a-$c" + fi + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + --data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type= $optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` 
+#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3233_BertBase-128_performance_1p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3233_BertBase-128_performance_1p.sh new file mode 100644 index 
0000000000000000000000000000000000000000..870acc19075a6afacac18489910c74a05709adbc --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3233_BertBase-128_performance_1p.sh @@ -0,0 +1,239 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertBase-128_ID3233_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=1.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MNLI +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" 
+ use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` 
+ profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-12_H-768_A-12 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + 
--data_dir=${data_path}/Glue/${task_name} \
+        --vocab_file=$bertmodelpath/vocab.txt \
+        --bert_config_file=$bertmodelpath/bert_config.json \
+        --init_checkpoint=$bertmodelpath/bert_model.ckpt \
+        --max_seq_length=128 \
+        --train_batch_size=$train_batch_size \
+        --learning_rate=$learning_rate \
+        --num_train_epochs=$num_train_epochs \
+        --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \
+        --horovod=false "$use_fp16" \
+        --distributed=False \
+        --npu_bert_loss_scale=0 \
+        --optimizer_type=$optimizer_type \
+        $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+# Training end time (do not modify)
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results (do not modify)
+echo "------------------ Final result ------------------"
+# Compute performance FPS (review per model)
+step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'`
+FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'`
+# Print (do not modify)
+echo "Final Performance images/sec : $FPS"
+# Extract training accuracy (review per model)
+train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'`
+# Print (do not modify)
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring result summary
+# Training case info (do not modify)
+BatchSize=${train_batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+## Collect performance data (do not modify)
+# Throughput
+ActualFPS=${FPS}
+# Per-step training time (do not modify)
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'`
+
+# Extract loss values from the training log into train_${CaseName}_loss.txt (review per model)
+grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration (do not modify)
+ActualLoss=`awk 'END {print}'
$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key info into ${CaseName}.log (do not modify)
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+
+
+
+
+
diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3233_BertBase-128_performance_8p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3233_BertBase-128_performance_8p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..634c486d8e56b4d47478f059253c073bb017a4e6
--- /dev/null
+++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3233_BertBase-128_performance_8p.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+# Current path (do not modify)
+cur_path=`pwd`
+
+# Collective-communication parameters (do not modify)
+export RANK_SIZE=8
+export RANK_TABLE_FILE=${cur_path}/../configs/${RANK_SIZE}p.json
+export JOB_ID=10087
+export GE_USE_STATIC_MEMORY=1
+export HCCL_CONNECT_TIMEOUT=600
+RANK_ID_START=0
+
+# Dataset path, keep empty here (do not modify)
+data_path=""
+
+# Basic parameters (review per model)
+# Network name, same as the directory name
+Network="BertBase-128_ID3233_for_Tensorflow"
+# Training batch size
+train_batch_size=32
+# Training epochs
+num_train_epochs=1.0
+# Learning rate
+learning_rate=1e-6
+warmup_proportion=0.1
+precision="fp32"
+# Debug parameters; precision_mode needs review per model
+precision_mode="allow_mix_precision"
+optimizer_type="adam"
+# Maintenance parameters, do not modify below
+over_dump=False
+over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MNLI +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" + use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* 
]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-12_H-768_A-12 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 
Core binding; remove for models that do not need it, review per model
+    let a=RANK_ID*12
+    let b=RANK_ID+1
+    let c=b*12-1
+
+    corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l`
+    let a=RANK_ID*${corenum}/8
+    let b=RANK_ID+1
+    let c=b*${corenum}/8-1
+    if [ "x${bind_core}" != x ];then
+        bind_core="taskset -c $a-$c"
+    fi
+
+    nohup ${bind_core} python3 ./src/run_classifier.py \
+        --task_name=$task_name \
+        --do_train=true \
+        --do_eval=true \
+        --enable_exception_dump=$enable_exception_dump \
+        --data_dump_flag=$data_dump_flag \
+        --data_dump_step=$data_dump_step \
+        --data_dump_path=$data_dump_path \
+        --over_dump=$over_dump \
+        --over_dump_path=$over_dump_path \
+        --precision_mode=$precision_mode \
+        --data_dir=${data_path}/Glue/${task_name} \
+        --vocab_file=$bertmodelpath/vocab.txt \
+        --bert_config_file=$bertmodelpath/bert_config.json \
+        --init_checkpoint=$bertmodelpath/bert_model.ckpt \
+        --max_seq_length=128 \
+        --train_batch_size=$train_batch_size \
+        --learning_rate=$learning_rate \
+        --num_train_epochs=$num_train_epochs \
+        --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \
+        --horovod=false "$use_fp16" \
+        --distributed=False \
+        --npu_bert_loss_scale=0 \
+        --optimizer_type=$optimizer_type \
+        $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+# Training end time (do not modify)
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results (do not modify)
+echo "------------------ Final result ------------------"
+# Compute performance FPS (review per model)
+step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'`
+FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'`
+# Print (do not modify)
+echo "Final Performance images/sec : $FPS"
+# Extract training accuracy (review per model)
+train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'`
+# Print (do not modify)
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3234_BertBase-128_full_1p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3234_BertBase-128_full_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..d35ca1ca2aed61e3daddcae8f198e55a2a043471 --- /dev/null +++ 
b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3234_BertBase-128_full_1p.sh @@ -0,0 +1,240 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertBase-128_ID3234_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=3.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MNLI +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" 
+ use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` 
+ profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-12_H-768_A-12 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + 
--data_dir=${data_path}/Glue/${task_name} \
+        --vocab_file=$bertmodelpath/vocab.txt \
+        --bert_config_file=$bertmodelpath/bert_config.json \
+        --init_checkpoint=$bertmodelpath/bert_model.ckpt \
+        --max_seq_length=128 \
+        --train_batch_size=$train_batch_size \
+        --learning_rate=$learning_rate \
+        --num_train_epochs=$num_train_epochs \
+        --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \
+        --horovod=false "$use_fp16" \
+        --distributed=False \
+        --npu_bert_loss_scale=0 \
+        --optimizer_type=$optimizer_type \
+        $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+# Training end time (do not modify)
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results (do not modify)
+echo "------------------ Final result ------------------"
+# Compute performance FPS (review per model)
+step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'`
+FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'`
+# Print (do not modify)
+echo "Final Performance images/sec : $FPS"
+# Extract training accuracy (review per model)
+train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'`
+# Print (do not modify)
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring result summary
+# Training case info (do not modify)
+BatchSize=${train_batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data (do not modify)
+# Throughput
+ActualFPS=${FPS}
+# Per-step training time (do not modify)
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'`
+
+# Extract loss values from the training log into train_${CaseName}_loss.txt (review per model)
+grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration (do not modify)
+ActualLoss=`awk 'END {print}'
$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key info into ${CaseName}.log (do not modify)
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+
+
+
+
+
diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3234_BertBase-128_full_8p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3234_BertBase-128_full_8p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..429e152327fae3c0a4486c8fa23e5eecaae06532
--- /dev/null
+++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3234_BertBase-128_full_8p.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+# Current path (do not modify)
+cur_path=`pwd`
+
+# Collective-communication parameters (do not modify)
+export RANK_SIZE=8
+export RANK_TABLE_FILE=${cur_path}/../configs/${RANK_SIZE}p.json
+export JOB_ID=10087
+export GE_USE_STATIC_MEMORY=1
+export HCCL_CONNECT_TIMEOUT=600
+RANK_ID_START=0
+
+# Dataset path, keep empty here (do not modify)
+data_path=""
+
+# Basic parameters (review per model)
+# Network name, same as the directory name
+Network="BertBase-128_ID3234_for_Tensorflow"
+# Training batch size
+train_batch_size=32
+# Training epochs
+num_train_epochs=3.0
+# Learning rate
+learning_rate=1e-6
+warmup_proportion=0.1
+precision="fp32"
+# Debug parameters; precision_mode needs review per model
+precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MNLI +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" + use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == 
--optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-12_H-768_A-12 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p 
${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+        mkdir -p ${data_dump_path}
+        mkdir -p ${over_dump_path}
+    fi
+
+    # Core binding; remove for models that do not need it, review per model
+    let a=RANK_ID*12
+    let b=RANK_ID+1
+    let c=b*12-1
+
+    corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l`
+    let a=RANK_ID*${corenum}/8
+    let b=RANK_ID+1
+    let c=b*${corenum}/8-1
+    if [ "x${bind_core}" != x ];then
+        bind_core="taskset -c $a-$c"
+    fi
+
+    nohup ${bind_core} python3 ./src/run_classifier.py \
+        --task_name=$task_name \
+        --do_train=true \
+        --do_eval=true \
+        --enable_exception_dump=$enable_exception_dump \
+        --data_dump_flag=$data_dump_flag \
+        --data_dump_step=$data_dump_step \
+        --data_dump_path=$data_dump_path \
+        --over_dump=$over_dump \
+        --over_dump_path=$over_dump_path \
+        --precision_mode=$precision_mode \
+        --data_dir=${data_path}/Glue/${task_name} \
+        --vocab_file=$bertmodelpath/vocab.txt \
+        --bert_config_file=$bertmodelpath/bert_config.json \
+        --init_checkpoint=$bertmodelpath/bert_model.ckpt \
+        --max_seq_length=128 \
+        --train_batch_size=$train_batch_size \
+        --learning_rate=$learning_rate \
+        --num_train_epochs=$num_train_epochs \
+        --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \
+        --horovod=false "$use_fp16" \
+        --distributed=False \
+        --npu_bert_loss_scale=0 \
+        --optimizer_type=$optimizer_type \
+        $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+# Training end time (do not modify)
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results (do not modify)
+echo "------------------ Final result ------------------"
+# Compute performance FPS (review per model)
+step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'`
+FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'`
+# Print (do not modify)
+echo "Final Performance images/sec : $FPS"
+# Extract training accuracy (review per model)
+train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'`
+#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3234_BertBase-128_performance_1p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3234_BertBase-128_performance_1p.sh new file mode 100644 index 
0000000000000000000000000000000000000000..334ad376a576dd18d1132e786499ea4f0114e909 --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3234_BertBase-128_performance_1p.sh @@ -0,0 +1,239 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertBase-128_ID3234_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=1.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False +over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump +data_dump_flag=False +data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump +enable_exception_dump=False +data_dump_step="0" +profiling=False +autotune=False + +#其他参数 +task_name=MNLI +output_dir=ckpt +type=official +use_xla=false +use_fp16="" +if [ "$precision" = "fp16" ] ; then + echo "fp16 activated!" + use_fp16="--amp" +else + echo "fp32/tf32 activated!" 
+ use_fp16="--noamp" +fi + + +if [ "$use_xla" = "true" ] ; then + use_xla_tag="--use_xla" + echo "XLA activated" +else + use_xla_tag="--nouse_xla" +fi + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --task_name* ]];then + task_name=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --train_batch_size* ]];then + train_batch_size=`echo ${para#*=}` + elif [[ $para == --learning_rate* ]];then + learning_rate=`echo ${para#*=}` + elif [[ $para == --num_train_epochs* ]];then + num_train_epochs=`echo ${para#*=}` + elif [[ $para == --output_dir* ]];then + output_dir=`echo ${para#*=}` + elif [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --optimizer_type* ]];then + optimizer_type=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + elif [[ $para == --data_dump_path* ]];then + data_dump_path=`echo ${para#*=}` + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --enable_exception_dump* ]];then + enable_exception_dump=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` 
+ profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +bertmodelpath=$ckpt_path/uncased_L-12_H-768_A-12 +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID} + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${data_dump_path} + mkdir -p ${over_dump_path} + fi + + # 绑核,不需要的绑核的模型删除,需要的模型审视修改 + let a=RANK_ID*12 + let b=RANK_ID+1 + let c=b*12-1 + + + nohup ${bind_core} python3 ./src/run_classifier.py \ + --task_name=$task_name \ + --do_train=true \ + --do_eval=true \ + --enable_exception_dump=$enable_exception_dump\ + --data_dump_flag=$data_dump_flag \ + --data_dump_step=$data_dump_step\ + --data_dump_path=$data_dump_path\ + --over_dump=$over_dump \ + --over_dump_path=$over_dump_path \ + --precision_mode=$precision_mode \ + 
--data_dir=${data_path}/Glue/${task_name} \ + --vocab_file=$bertmodelpath/vocab.txt \ + --bert_config_file=$bertmodelpath/bert_config.json \ + --init_checkpoint=$bertmodelpath/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=$train_batch_size \ + --learning_rate=$learning_rate \ + --num_train_epochs=$num_train_epochs \ + --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \ + --horovod=false "$use_fp16" \ + --distributed=False \ + --npu_bert_loss_scale=0 \ + --optimizer_type= $optimizer_type \ + $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'` +FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/${output_dir}/eval_results.txt|awk '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${train_batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' 
$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3234_BertBase-128_performance_8p.sh b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3234_BertBase-128_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..148d0663aa68325a43c7b653e58f8bcf2b53f8b4 --- /dev/null +++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/test/train_ID3234_BertBase-128_performance_8p.sh @@ -0,0 +1,249 @@ +#!/bin/bash +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 +export RANK_TABLE_FILE=#cur_path/../configs/${RANK_SIZE}p.json +export JOB_ID=10087 +export GE_USE_STATIC_MEMORY=1 +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="BertBase-128_ID3234_for_Tensorflow" +#训练batch_size +train_batch_size=32 +#训练ephch +num_train_epochs=1.0 +#学习率 +learning_rate=1e-6 +warmup_proportion=0.1 +precision="fp32" +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +optimizer_type="adam" +#维持参数,以下不需要修改 +over_dump=False 
+over_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/overflow_dump
+data_dump_flag=False
+data_dump_path=${cur_path}/output/${ASCEND_DEVICE_ID}/data_dump
+enable_exception_dump=False
+data_dump_step="0"
+profiling=False
+autotune=False
+
+# Other parameters
+task_name=MNLI
+output_dir=ckpt
+type=official
+use_xla=false
+use_fp16=""
+if [ "$precision" = "fp16" ] ; then
+    echo "fp16 activated!"
+    use_fp16="--amp"
+else
+    echo "fp32/tf32 activated!"
+    use_fp16="--noamp"
+fi
+
+
+if [ "$use_xla" = "true" ] ; then
+    use_xla_tag="--use_xla"
+    echo "XLA activated"
+else
+    use_xla_tag="--nouse_xla"
+fi
+
+# Help message (FIX: added the missing space after "echo"; `echo"usage..."` is parsed as one unknown command name)
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_performance_1P.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+# Parse command-line overrides of the defaults above; no need to modify
+for para in $*
+do
+    if [[ $para == --task_name* ]];then
+        task_name=`echo ${para#*=}`
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --bind_core* ]];then
+        bind_core=`echo ${para#*=}`
+        name_bind="_bindcore"
+    elif [[ $para == --ckpt_path* ]];then
+        ckpt_path=`echo ${para#*=}`
+    elif [[ $para == --train_batch_size* ]];then
+        train_batch_size=`echo ${para#*=}`
+    elif [[ $para == --learning_rate* ]];then
+        learning_rate=`echo ${para#*=}`
+    elif [[ $para == --num_train_epochs* ]];then
+        num_train_epochs=`echo ${para#*=}`
+    elif [[ $para == --output_dir* ]];then
+        output_dir=`echo ${para#*=}`
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --optimizer_type* ]];then
+        optimizer_type=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+    elif [[ $para == --data_dump_path* ]];then
+        data_dump_path=`echo ${para#*=}`
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --enable_exception_dump* ]];then
+        enable_exception_dump=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --autotune* ]];then
+        autotune=`echo ${para#*=}`
+        mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak
+        mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak
+        autotune_dump_path=${cur_path}/output/autotune_dump
+        mkdir -p ${autotune_dump_path}/GA
+        mkdir -p ${autotune_dump_path}/RL
+        cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom_bak ${autotune_dump_path}/GA/
+        cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom_bak ${autotune_dump_path}/RL/
+    fi
+done
+
+# data_path is mandatory; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+bertmodelpath=$ckpt_path/uncased_L-12_H-768_A-12
+# Training start time; no need to modify
+start_time=$(date +%s)
+
+# Enter the training-script directory; review per model
+cd $cur_path/../
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    # Set environment variables; no need to modify
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    ASCEND_DEVICE_ID=${ASCEND_DEVICE_ID}
+
+    # Recreate the per-device output directories; no need to modify
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+        mkdir -p ${data_dump_path}
+        mkdir -p ${over_dump_path}
+    else
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+        mkdir -p ${data_dump_path}
+        mkdir -p ${over_dump_path}
+    fi
+
+    # Core binding (NOTE: the first a/b/c triple is superseded by the corenum-based one below); review per model
+    let a=RANK_ID*12
+    let b=RANK_ID+1
+    let c=b*12-1
+
+    corenum=`cat /proc/cpuinfo |grep 'processor' | wc -l`
+    let a=RANK_ID*${corenum}/8
+    let b=RANK_ID+1
+    let c=b*${corenum}/8-1
+    # FIX: the original test '"x${bind_core"' was missing the closing brace, a fatal syntax error
+    if [ "x${bind_core}" != x ];then
+        bind_core="taskset -c $a-$c"
+    fi
+
+    nohup ${bind_core} python3 ./src/run_classifier.py \
+        --task_name=$task_name \
+        --do_train=true \
+        --do_eval=true \
+        --enable_exception_dump=$enable_exception_dump\
+        --data_dump_flag=$data_dump_flag \
+        --data_dump_step=$data_dump_step\
+        --data_dump_path=$data_dump_path\
+        --over_dump=$over_dump \
+        --over_dump_path=$over_dump_path \
+        --precision_mode=$precision_mode \
+        --data_dir=${data_path}/Glue/${task_name} \
+        --vocab_file=$bertmodelpath/vocab.txt \
+        --bert_config_file=$bertmodelpath/bert_config.json \
+        --init_checkpoint=$bertmodelpath/bert_model.ckpt \
+        --max_seq_length=128 \
+        --train_batch_size=$train_batch_size \
+        --learning_rate=$learning_rate \
+        --num_train_epochs=$num_train_epochs \
+        --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir} \
+        --horovod=false "$use_fp16" \
+        --distributed=False \
+        --npu_bert_loss_scale=0 \
+        --optimizer_type=$optimizer_type \
+        $use_xla_tag --warmup_proportion=$warmup_proportion > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Result printing; no need to modify
+echo "------------------ Final result ------------------"
+# Performance (FPS); review per model
+step_sec=`grep -a 'INFO:tensorflow:global_step/sec: ' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'`
+FPS=`awk 'BEGIN{printf "%d\n",'$step_sec' * '$train_batch_size'}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+# Training accuracy (FIX: eval_results.txt is written under output/$ASCEND_DEVICE_ID/${output_dir}, not ${cur_path}/${output_dir})
+train_accuracy=`grep -a 'eval_accuracy' ${cur_path}/output/$ASCEND_DEVICE_ID/${output_dir}/eval_results.txt|awk '{print $3}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance-watch result summary
+# Test-case information; no need to modify
+BatchSize=${train_batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+## Collect performance data; no need to modify
+# Throughput
+ActualFPS=${FPS}
+# Time of a single training iteration (ms); no need to modify
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'`
+
+# Extract per-step loss values from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep ' basic_session_run_hooks.py:260] loss =' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'loss = ' '{print $2}'|awk -F ', ' '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information into ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "train_accuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+
+
+
+