diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/imagenet_preprocessing_HW192.py b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/imagenet_preprocessing_HW192.py new file mode 100644 index 0000000000000000000000000000000000000000..16294a2c94e87b07f9cf03714af5b52e3180cc96 --- /dev/null +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/imagenet_preprocessing_HW192.py @@ -0,0 +1,629 @@ +#!/usr/bin/env python +# coding=utf-8 + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Provides utilities to preprocess images. + +Training images are sampled using the provided bounding boxes, and subsequently +cropped to the sampled bounding box. Images are additionally flipped randomly, +then resized to the target output size (without aspect-ratio preservation). + +Images used during evaluation are resized (with aspect-ratio preservation) and +centrally cropped. + +All images undergo mean color subtraction. + +Note that these steps are colloquially referred to as "ResNet preprocessing," +and they differ from "VGG preprocessing," which does not use bounding boxes +and instead does an aspect-preserving resize followed by random crop during +training. (These both differ from "Inception preprocessing," which introduces +color distortion steps.) + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os +from absl import flags +from absl import logging +import tensorflow as tf + +DEFAULT_IMAGE_SIZE = 192 +EVAL_IMAGE_SIZE = 224 +NUM_CHANNELS = 3 + +NUM_IMAGES = { + 'train': 1281167, + 'validation': 50000, +} + +_NUM_TRAIN_FILES = 1024 +_SHUFFLE_BUFFER = 10000 + +_R_MEAN = 123.68 +_G_MEAN = 116.78 +_B_MEAN = 103.94 +CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN] + +# The lower bound for the smallest side of the image for aspect-preserving +# resizing. For example, if an image is 500 x 1000, it will be resized to +# _RESIZE_MIN x (_RESIZE_MIN * 2). 
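# A pure-Python sketch of the smallest-side arithmetic this constant drives
# (the helper name below is made up for illustration; the graph-mode version
# is _smallest_size_at_least further down in this file):
def _sketch_smallest_side(height, width, resize_min=256):
    scale = resize_min / min(height, width)
    return int(height * scale), int(width * scale)

# e.g. a 500 x 1000 image keeps its aspect ratio: (256, 512)
assert _sketch_smallest_side(500, 1000) == (256, 512)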
+_RESIZE_MIN = 256 + +FLAGS = flags.FLAGS + + +def process_record_dataset(dataset, + is_training, + batch_size, + shuffle_buffer, + dtype=tf.float32, + datasets_num_private_threads=None, + drop_remainder=False, + tf_data_experimental_slack=False, + prefetch_batchs=tf.data.experimental.AUTOTUNE): + """Given a Dataset with raw records, return an iterator over the records. + + Args: + dataset: A Dataset representing raw records + is_training: A boolean denoting whether the input is for training. + batch_size: The number of samples per batch. + shuffle_buffer: The buffer size to use when shuffling records. A larger + value results in better randomness, but smaller values reduce startup + time and use less memory. + dtype: Data type to use for images/features. + datasets_num_private_threads: Number of threads for a private + threadpool created for all datasets computation. + drop_remainder: A boolean indicates whether to drop the remainder of the + batches. If True, the batch dimension will be static. + tf_data_experimental_slack: Whether to enable tf.data's + `experimental_slack` option. + prefetch_batchs: The number of batchs to prefetch. + + Returns: + Dataset of (image, label) pairs ready for iteration. + """ + # Defines a specific size thread pool for tf.data operations. + if datasets_num_private_threads: + options = tf.data.Options() + options.experimental_threading.private_threadpool_size = ( + datasets_num_private_threads) + dataset = dataset.with_options(options) + logging.info( + 'datasets_num_private_threads: %s', datasets_num_private_threads) + + if is_training: + # Shuffles records before repeating to respect epoch boundaries. + dataset = dataset.shuffle(buffer_size=shuffle_buffer) + # Repeats the dataset for the number of epochs to train. + dataset = dataset.repeat() + + one_hot = False + num_classes = FLAGS.num_classes + if FLAGS.label_smoothing and FLAGS.label_smoothing > 0: + one_hot = True + + logging.info('Num classes: %d', num_classes) + logging.info('One hot: %s', one_hot) + if is_training and FLAGS.cache_decoded_image: + parse_record_fn = preprocess_parsed_example + else: + parse_record_fn = parse_and_preprocess_record + + map_fn = functools.partial( + parse_record_fn, + is_training=is_training, + dtype=dtype, + num_classes=num_classes, + one_hot=one_hot) + + # Parses the raw records into images and labels. + #dataset = dataset.map( + # map_fn, + # num_parallel_calls=tf.data.experimental.AUTOTUNE) + dataset = dataset.map( + map_fn, + num_parallel_calls=12) + dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) + + # Operations between the final prefetch and the get_next call to the iterator + # will happen synchronously during run time. We prefetch here again to + # background all of the above processing work and keep it out of the + # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE + # allows DistributionStrategies to adjust how many batches to fetch based + # on how many devices are present. 
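# A minimal, self-contained sketch of the same shuffle -> repeat -> map ->
# batch -> prefetch ordering used above, on dummy in-memory data rather than
# TFRecords (the map function here is just a stand-in):
import tensorflow as tf

ds = tf.data.Dataset.range(100)
ds = ds.shuffle(buffer_size=100)   # shuffle before repeat to respect epoch boundaries
ds = ds.repeat()
ds = ds.map(lambda x: x * 2, num_parallel_calls=tf.data.experimental.AUTOTUNE)
ds = ds.batch(8, drop_remainder=True)
ds = ds.prefetch(tf.data.experimental.AUTOTUNE)  # overlap preprocessing with training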
+ dataset = dataset.prefetch(buffer_size=prefetch_batchs) + + options = tf.data.Options() + options.experimental_slack = tf_data_experimental_slack + dataset = dataset.with_options(options) + + return dataset + + +def get_filenames(is_training, data_dir): + """Return filenames for dataset.""" + if is_training: + return [ + os.path.join(data_dir, 'train-%05d-of-01024' % i) + for i in range(_NUM_TRAIN_FILES)] + else: + return [ + os.path.join(data_dir, 'validation-%05d-of-00128' % i) + for i in range(128)] + + +def parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. + + The output of the build_image_data.py image preprocessing script is a dataset + containing serialized Example protocol buffers. Each Example proto contains + the following fields (values are included as examples): + + image/height: 462 + image/width: 581 + image/colorspace: 'RGB' + image/channels: 3 + image/class/label: 615 + image/class/synset: 'n03623198' + image/class/text: 'knee pad' + image/object/bbox/xmin: 0.1 + image/object/bbox/xmax: 0.9 + image/object/bbox/ymin: 0.2 + image/object/bbox/ymax: 0.6 + image/object/bbox/label: 615 + image/format: 'JPEG' + image/filename: 'ILSVRC2012_val_00041207.JPEG' + image/encoded: + + Args: + example_serialized: scalar Tensor tf.string containing a serialized + Example protocol buffer. + + Returns: + image_buffer: Tensor tf.string containing the contents of a JPEG file. + label: Tensor tf.int32 containing the label. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + """ + # Dense features in Example proto. + feature_map = { + 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, + default_value=''), + 'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64, + default_value=-1), + 'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string, + default_value=''), + } + sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. + feature_map.update( + {k: sparse_float32 for k in [ + 'image/object/bbox/xmin', 'image/object/bbox/ymin', + 'image/object/bbox/xmax', 'image/object/bbox/ymax']}) + + features = tf.io.parse_single_example(serialized=example_serialized, + features=feature_map) + label = tf.cast(features['image/class/label'], dtype=tf.int32) + + xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) + ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) + xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) + ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) + + # Note that we impose an ordering of (y, x) just to make life difficult. + bbox = tf.concat([ymin, xmin, ymax, xmax], 0) + + # Force the variable number of bounding boxes into the shape + # [1, num_boxes, coords]. 
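# A shape-only sketch of the reassembly below, using made-up values for two
# boxes: four [1, num_boxes] rows are concatenated, then rearranged to
# [1, num_boxes, 4] with coordinates ordered (ymin, xmin, ymax, xmax):
import tensorflow as tf

ymin = tf.constant([[0.2, 0.1]])
xmin = tf.constant([[0.1, 0.0]])
ymax = tf.constant([[0.6, 0.9]])
xmax = tf.constant([[0.9, 1.0]])
toy_bbox = tf.concat([ymin, xmin, ymax, xmax], 0)       # [4, num_boxes]
toy_bbox = tf.expand_dims(toy_bbox, 0)                  # [1, 4, num_boxes]
toy_bbox = tf.transpose(toy_bbox, perm=[0, 2, 1])       # [1, num_boxes, 4]
assert toy_bbox.shape == (1, 2, 4)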
+ bbox = tf.expand_dims(bbox, 0) + bbox = tf.transpose(a=bbox, perm=[0, 2, 1]) + + return features['image/encoded'], label, bbox + + +def parse_example_proto_and_decode(example_serialized): + """Parses an example and decodes the image to prepare for caching.""" + image_buffer, label, bbox = parse_example_proto(example_serialized) + image_buffer = tf.reshape(image_buffer, shape=[]) + image_buffer = tf.io.decode_jpeg(image_buffer, 3) + return image_buffer, label, bbox + + +def preprocess_parsed_example( + image_buffer, label, bbox, is_training, dtype, num_classes, one_hot=False): + """Applies preprocessing steps to the input parsed example.""" + image = preprocess_image( + image_buffer=image_buffer, + bbox=bbox, + output_height=DEFAULT_IMAGE_SIZE, + output_width=DEFAULT_IMAGE_SIZE, + num_channels=NUM_CHANNELS, + is_training=is_training) + image = tf.cast(image, dtype) + + # Subtract one so that labels are in [0, 1000), and cast to float32 for + # Keras model. + label = tf.reshape(label, shape=[1]) + label = tf.cast(label, tf.int32) + label -= 1 + + if one_hot: + label = tf.one_hot(label, num_classes) + label = tf.reshape(label, [num_classes]) + else: + label = tf.cast(label, tf.float32) + + return image, label + + +def parse_and_preprocess_record( + raw_record, is_training, dtype, num_classes, one_hot=False): + """Parses and preprocesses a record containing a training example of an image. + + The input record is parsed into a label and image, and the image is passed + through preprocessing steps (cropping, flipping, and so on). + + Args: + raw_record: scalar Tensor tf.string containing a serialized + Example protocol buffer. + is_training: A boolean denoting whether the input is for training. + dtype: data type to use for images/features. + num_classes: Number of classes for one hot encoding. + one_hot: Whether to use one_hot encoding on label. + + Returns: + Tuple with processed image tensor in a channel-last format and + one-hot-encoded label tensor. + """ + image_buffer, label, bbox = parse_example_proto(raw_record) + return preprocess_parsed_example(image_buffer=image_buffer, + label=label, + bbox=bbox, + is_training=is_training, + dtype=dtype, + one_hot=one_hot, + num_classes=num_classes) + + +def input_fn(is_training, + data_dir, + batch_size, + dtype=tf.float32, + datasets_num_private_threads=None, + input_context=None, + drop_remainder=False, + tf_data_experimental_slack=False, + dataset_cache=False, + filenames=None, + prefetch_batchs=tf.data.experimental.AUTOTUNE): + """Input function which provides batches for train or eval. + + Args: + is_training: A boolean denoting whether the input is for training. + data_dir: The directory containing the input data. + batch_size: The number of samples per batch. + dtype: Data type to use for images/features + datasets_num_private_threads: Number of private threads for tf.data. + input_context: A `tf.distribute.InputContext` object passed in by + `tf.distribute.Strategy`. + drop_remainder: A boolean indicates whether to drop the remainder of the + batches. If True, the batch dimension will be static. + tf_data_experimental_slack: Whether to enable tf.data's + `experimental_slack` option. + dataset_cache: Whether to cache the dataset on workers. + Typically used to improve training performance when training data is in + remote storage and can fit into worker memory. + filenames: Optional field for providing the file names of the TFRecords. + prefetch_batchs: The number of batchs to prefetch. 
+ + Returns: + A dataset that can be used for iteration. + """ + if filenames is None: + filenames = get_filenames(is_training, data_dir) + dataset = tf.data.Dataset.from_tensor_slices(filenames) + + import npu_device as npu + dataset, batch_size = npu.distribute.shard_and_rebatch_dataset(dataset, batch_size) + #if input_context: + # logging.info( + # 'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d', + # input_context.input_pipeline_id, input_context.num_input_pipelines) + # dataset = dataset.shard(input_context.num_input_pipelines, + # input_context.input_pipeline_id) + + if is_training: + # Shuffle the input files + dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES) + + # Convert to individual records. + # cycle_length = 10 means that up to 10 files will be read and deserialized in + # parallel. You may want to increase this number if you have a large number of + # CPU cores. + dataset = dataset.interleave( + tf.data.TFRecordDataset, + cycle_length=10, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + if is_training and FLAGS.cache_decoded_image: + dataset = dataset.map( + parse_example_proto_and_decode, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + if dataset_cache: + # Improve training / eval performance when data is in remote storage and + # can fit into worker memory. + dataset = dataset.cache() + + return process_record_dataset( + dataset=dataset, + is_training=is_training, + batch_size=batch_size, + shuffle_buffer=_SHUFFLE_BUFFER, + dtype=dtype, + datasets_num_private_threads=datasets_num_private_threads, + drop_remainder=drop_remainder, + tf_data_experimental_slack=tf_data_experimental_slack, + prefetch_batchs=prefetch_batchs, + ) + + +def _decode_crop_and_flip(image_buffer, bbox, num_channels): + """Crops the given image to a random part of the image, and randomly flips. + + We use the fused decode_and_crop op, which performs better than the two ops + used separately in series, but note that this requires that the image be + passed in as an un-decoded string Tensor. + + Args: + image_buffer: scalar string Tensor representing the raw JPEG image buffer. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + num_channels: Integer depth of the image buffer for decoding. + + Returns: + 3-D tensor with cropped image. + + """ + # A large fraction of image datasets contain a human-annotated bounding box + # delineating the region of the image containing the object of interest. We + # choose to create a new bounding box for the object which is a randomly + # distorted version of the human-annotated bounding box that obeys an + # allowed range of aspect ratios, sizes and overlap with the human-annotated + # bounding box. If no box is supplied, then we assume the bounding box is + # the entire image. + decoded = image_buffer.dtype != tf.string + shape = (tf.shape(image_buffer) if decoded + else tf.image.extract_jpeg_shape(image_buffer)) + sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( + shape, + bounding_boxes=bbox, + min_object_covered=0.1, + aspect_ratio_range=[0.75, 1.33], + area_range=[0.05, 1.0], + max_attempts=100, + use_image_if_no_bounding_boxes=True) + bbox_begin, bbox_size, _ = sample_distorted_bounding_box + + # Reassemble the bounding box in the format the crop op requires. 
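# A standalone sketch of the sampled-crop path used here: sample a distorted
# box, pack it into a crop window, and decode only that window from the JPEG
# bytes ('image.jpg' is a placeholder path, and no annotated boxes are given):
import tensorflow as tf

jpeg_bytes = tf.io.read_file('image.jpg')  # placeholder
begin, size, _ = tf.image.sample_distorted_bounding_box(
    tf.image.extract_jpeg_shape(jpeg_bytes),
    bounding_boxes=tf.zeros([1, 0, 4]),
    min_object_covered=0.1,
    use_image_if_no_bounding_boxes=True)
offset_y, offset_x, _ = tf.unstack(begin)
target_h, target_w, _ = tf.unstack(size)
window = tf.stack([offset_y, offset_x, target_h, target_w])
patch = tf.image.decode_and_crop_jpeg(jpeg_bytes, window, channels=3)
patch = tf.image.random_flip_left_right(patch)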
+ offset_y, offset_x, _ = tf.unstack(bbox_begin) + target_height, target_width, _ = tf.unstack(bbox_size) + crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) + + if decoded: + cropped = tf.image.crop_to_bounding_box( + image_buffer, + offset_height=offset_y, + offset_width=offset_x, + target_height=target_height, + target_width=target_width) + else: + # Use the fused decode and crop op here, which is faster than sequential. + cropped = tf.image.decode_and_crop_jpeg( + image_buffer, crop_window, channels=num_channels) + + # Flip to add a little more random distortion in. + cropped = tf.image.random_flip_left_right(cropped) + return cropped + + +def _central_crop(image, crop_height, crop_width): + """Performs central crops of the given image list. + + Args: + image: a 3-D image tensor + crop_height: the height of the image following the crop. + crop_width: the width of the image following the crop. + + Returns: + 3-D tensor with cropped image. + """ + shape = tf.shape(input=image) + height, width = shape[0], shape[1] + + amount_to_be_cropped_h = (height - crop_height) + crop_top = amount_to_be_cropped_h // 2 + amount_to_be_cropped_w = (width - crop_width) + crop_left = amount_to_be_cropped_w // 2 + return tf.slice( + image, [crop_top, crop_left, 0], [crop_height, crop_width, -1]) + + +def _mean_image_subtraction(image, means, num_channels): + """Subtracts the given means from each image channel. + + For example: + means = [123.68, 116.779, 103.939] + image = _mean_image_subtraction(image, means) + + Note that the rank of `image` must be known. + + Args: + image: a tensor of size [height, width, C]. + means: a C-vector of values to subtract from each channel. + num_channels: number of color channels in the image that will be distorted. + + Returns: + the centered image. + + Raises: + ValueError: If the rank of `image` is unknown, if `image` has a rank other + than three or if the number of channels in `image` doesn't match the + number of values in `means`. + """ + if image.get_shape().ndims != 3: + raise ValueError('Input must be of size [height, width, C>0]') + + if len(means) != num_channels: + raise ValueError('len(means) must match the number of channels') + + # We have a 1-D tensor of means; convert to 3-D. + # Note(b/130245863): we explicitly call `broadcast` instead of simply + # expanding dimensions for better performance. + means = tf.broadcast_to(means, tf.shape(image)) + + return image - means + + +def _smallest_size_at_least(height, width, resize_min): + """Computes new shape with the smallest side equal to `smallest_side`. + + Computes new shape with the smallest side equal to `smallest_side` while + preserving the original aspect ratio. + + Args: + height: an int32 scalar tensor indicating the current height. + width: an int32 scalar tensor indicating the current width. + resize_min: A python integer or scalar `Tensor` indicating the size of + the smallest side after resize. + + Returns: + new_height: an int32 scalar tensor indicating the new height. + new_width: an int32 scalar tensor indicating the new width. + """ + resize_min = tf.cast(resize_min, tf.float32) + + # Convert to floats to make subsequent calculations go smoothly. + height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32) + + smaller_dim = tf.minimum(height, width) + scale_ratio = resize_min / smaller_dim + + # Convert back to ints to make heights and widths that TF ops will accept. 
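# A tiny eager sketch of the two helpers defined above (_central_crop and
# _mean_image_subtraction), with random pixels standing in for a decoded image:
import tensorflow as tf

img = tf.random.uniform([300, 400, 3], maxval=255.0)
top = (tf.shape(img)[0] - 224) // 2
left = (tf.shape(img)[1] - 224) // 2
crop = tf.slice(img, [top, left, 0], [224, 224, -1])
crop -= tf.broadcast_to([123.68, 116.78, 103.94], tf.shape(crop))  # per-channel means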
+ new_height = tf.cast(height * scale_ratio, tf.int32) + new_width = tf.cast(width * scale_ratio, tf.int32) + + return new_height, new_width + + +def _aspect_preserving_resize(image, resize_min): + """Resize images preserving the original aspect ratio. + + Args: + image: A 3-D image `Tensor`. + resize_min: A python integer or scalar `Tensor` indicating the size of + the smallest side after resize. + + Returns: + resized_image: A 3-D tensor containing the resized image. + """ + shape = tf.shape(input=image) + height, width = shape[0], shape[1] + + new_height, new_width = _smallest_size_at_least(height, width, resize_min) + + return _resize_image(image, new_height, new_width) + + +def _resize_image(image, height, width): + """Simple wrapper around tf.resize_images. + + This is primarily to make sure we use the same `ResizeMethod` and other + details each time. + + Args: + image: A 3-D image `Tensor`. + height: The target height for the resized image. + width: The target width for the resized image. + + Returns: + resized_image: A 3-D tensor containing the resized image. The first two + dimensions have the shape [height, width]. + """ + return tf.compat.v1.image.resize( + image, [height, width], method=tf.image.ResizeMethod.BILINEAR, + align_corners=False) + + +def preprocess_image(image_buffer, bbox, output_height, output_width, + num_channels, is_training=False): + """Preprocesses the given image. + + Preprocessing includes decoding, cropping, and resizing for both training + and eval images. Training preprocessing, however, introduces some random + distortion of the image to improve accuracy. + + Args: + image_buffer: scalar string Tensor representing the raw JPEG image buffer. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + output_height: The height of the image after preprocessing. + output_width: The width of the image after preprocessing. + num_channels: Integer depth of the image buffer for decoding. + is_training: `True` if we're preprocessing the image for training and + `False` otherwise. + + Returns: + A preprocessed image. + """ + if is_training: + # For training, we want to randomize some of the distortions. + image = _decode_crop_and_flip(image_buffer, bbox, num_channels) + image = _resize_image(image, output_height, output_width) + else: + # For validation, we want to decode, resize, then just crop the middle. 
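# An end-to-end sketch of the eval path described in the comment above, on
# dummy pixels: aspect-preserving resize so the short side is _RESIZE_MIN,
# then a central crop to EVAL_IMAGE_SIZE (note this file evaluates at 224 even
# though training images are 192). tf.image.resize_with_crop_or_pad stands in
# for the file's own _central_crop helper:
import tensorflow as tf

img = tf.random.uniform([500, 1000, 3], maxval=255.0)
scale = 256.0 / tf.cast(tf.reduce_min(tf.shape(img)[:2]), tf.float32)
new_hw = tf.cast(tf.cast(tf.shape(img)[:2], tf.float32) * scale, tf.int32)
img = tf.image.resize(img, new_hw)                      # -> 256 x 512
img = tf.image.resize_with_crop_or_pad(img, 224, 224)   # central 224 x 224 crop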
+ if image_buffer.dtype == tf.string: + image = tf.image.decode_jpeg(image_buffer, channels=num_channels) + else: + image = image_buffer + output_height = EVAL_IMAGE_SIZE + output_width = EVAL_IMAGE_SIZE + image = _aspect_preserving_resize(image, _RESIZE_MIN) + image = _central_crop(image, output_height, output_width) + + image.set_shape([output_height, output_width, num_channels]) + + return _mean_image_subtraction(image, CHANNEL_MEANS, num_channels) \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_ctl_imagenet_main_HW192.py b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_ctl_imagenet_main_HW192.py new file mode 100644 index 0000000000000000000000000000000000000000..077b2c3ead2e862d1049dd5229eae712b60d0dd3 --- /dev/null +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_ctl_imagenet_main_HW192.py @@ -0,0 +1,337 @@ +#!/usr/bin/env python +# coding=utf-8 + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Runs a ResNet model on the ImageNet dataset using custom training loops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import app +from absl import flags +from absl import logging +import tensorflow as tf + +from tf2_common.modeling import performance +from tf2_common.training import controller +from tf2_common.utils.flags import core as flags_core +from tf2_common.utils.logs import logger +from tf2_common.utils.misc import distribution_utils +from tf2_common.utils.misc import keras_utils +from tf2_common.utils.misc import model_helpers +from tf2_common.utils.mlp_log import mlp_log +import common +import imagenet_preprocessing_HW192 +import resnet_runnable_HW192 +import json +import npu_device + + +flags.DEFINE_boolean(name='use_tf_function', default=True, + help='Wrap the train and test step inside a ' + 'tf.function.') +flags.DEFINE_boolean(name='single_l2_loss_op', default=False, + help='Calculate L2_loss on concatenated weights, ' + 'instead of using Keras per-layer L2 loss.') +flags.DEFINE_boolean(name='cache_decoded_image', default=False, + help='Whether or not to cache decoded images in the ' + 'input pipeline. If this flag and `cache` is enabled, ' + 'then TFExample protos will be parsed and then cached ' + 'which reduces the load on hosts.') +flags.DEFINE_boolean(name='enable_device_warmup', default=False, + help='Whether or not to enable device warmup. This ' + 'includes training on dummy data and enabling graph/XLA ' + 'compilation before run_start.') +flags.DEFINE_integer(name='device_warmup_steps', default=1, + help='The number of steps to apply for device warmup.') +flags.DEFINE_integer(name='num_replicas', default=32, + help='The number of TPU cores to use, ' + 'for log printout only.') + +flags.DEFINE_string(name='precision_mode', default= 'allow_mix_precision', + help='allow_fp32_to_fp16/force_fp16/ ' + 'must_keep_origin_dtype/allow_mix_precision.') +flags.DEFINE_boolean(name='over_dump', default=False, + help='if or not over detection, default is False') +flags.DEFINE_boolean(name='data_dump_flag', default=False, + help='data dump flag, default is False') +flags.DEFINE_string(name='data_dump_step', default="10", + help='data dump step, default is 10') +flags.DEFINE_boolean(name='profiling', default=False, + help='if or not profiling for performance debug, default is False') +flags.DEFINE_string(name='profiling_dump_path', default="/home/data", + help='the path to save profiling data') +flags.DEFINE_string(name='over_dump_path', default="/home/data", + help='the path to save over dump data') +flags.DEFINE_string(name='data_dump_path', default="/home/data", + help='the path to save dump data') +flags.DEFINE_boolean(name='autotune', default=False, + help='whether to enable autotune, default is False') + +def npu_config(): + FLAGS = flags.FLAGS + npu_config = {} + + if FLAGS.data_dump_flag: + npu_device.global_options().dump_config.enable_dump = True + npu_device.global_options().dump_config.dump_path = FLAGS.data_dump_path + npu_device.global_options().dump_config.dump_step = FLAGS.data_dump_step + npu_device.global_options().dump_config.dump_mode = "all" + + if FLAGS.over_dump: + npu_device.global_options().dump_config.enable_dump_debug = True + npu_device.global_options().dump_config.dump_path = FLAGS.over_dump_path + npu_device.global_options().dump_config.dump_debug_mode = "all" + + if FLAGS.profiling: + 
npu_device.global_options().profiling_config.enable_profiling = True + profiling_options = '{"output":"' + FLAGS.profiling_dump_path + '", \ + "training_trace":"on", \ + "task_trace":"on", \ + "aicpu":"on", \ + "fp_point":"while1While_body_while_body_10958_1/while/resnet50/conv1/Conv2Dwhile1While_body_while_body_10958_1/while/resnet50/bn_conv1/FusedBatchNormV3_Reduce", \ + "bp_point":"while1While_body_while_body_10958_1/gradient_tape/while/resnet50/bn5c_branch2c/FusedBatchNormGradV3_Update"}' + npu_device.global_options().profiling_config.profiling_options = profiling_options + npu_device.global_options().precision_mode=FLAGS.precision_mode + npu_device.open().as_default() + + +def build_stats(runnable, time_callback): + """Normalizes and returns dictionary of stats. + + Args: + runnable: The module containing all the training and evaluation metrics. + time_callback: Time tracking callback instance. + + Returns: + Dictionary of normalized results. + """ + stats = {} + + if not runnable.flags_obj.skip_eval: + if runnable.test_loss: + stats['eval_loss'] = runnable.test_loss.result().numpy() + if runnable.test_accuracy: + stats['eval_acc'] = runnable.test_accuracy.result().numpy() + + if runnable.train_loss: + stats['train_loss'] = runnable.train_loss.result().numpy() + if runnable.train_accuracy: + stats['train_acc'] = runnable.train_accuracy.result().numpy() + + if time_callback: + timestamp_log = time_callback.timestamp_log + stats['step_timestamp_log'] = timestamp_log + stats['train_finish_time'] = time_callback.train_finish_time + if time_callback.epoch_runtime_log: + stats['avg_exp_per_second'] = time_callback.average_examples_per_second + + return stats + + +def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop): + """Calculates steps to run on device.""" + if steps_per_loop <= 0: + raise ValueError('steps_per_loop should be positive integer.') + if steps_per_loop == 1: + return steps_per_loop + return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch) + + +def run(flags_obj): + """Run ResNet ImageNet training and eval loop using custom training loops. + + Args: + flags_obj: An object containing parsed flag values. + + Raises: + ValueError: If fp16 is passed as it is not currently supported. + + Returns: + Dictionary of training and eval stats. + """ + mlp_log.mlperf_print('cache_clear', True) + mlp_log.mlperf_print('init_start', None) + mlp_log.mlperf_print('submission_benchmark', 'resnet') + mlp_log.mlperf_print('submission_division', 'closed') + mlp_log.mlperf_print('submission_org', 'google') + mlp_log.mlperf_print( + 'submission_platform', 'tpu-v3-{}'.format(flags_obj.num_replicas) + if flags_obj.tpu else 'gpu-v100-{}'.format(flags_obj.num_gpus)) + mlp_log.mlperf_print('submission_status', 'cloud') + + npu_config() + + common.print_flags(flags_obj) + + keras_utils.set_session_config( + enable_eager=flags_obj.enable_eager, + enable_xla=flags_obj.enable_xla) + performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj)) + + if tf.config.list_physical_devices('GPU'): + if flags_obj.tf_gpu_thread_mode: + datasets_num_private_threads = keras_utils.set_gpu_thread_mode_and_count( + per_gpu_thread_count=flags_obj.per_gpu_thread_count, + gpu_thread_mode=flags_obj.tf_gpu_thread_mode, + num_gpus=flags_obj.num_gpus) + if not flags_obj.datasets_num_private_threads: + flags_obj.datasets_num_private_threads = datasets_num_private_threads + common.set_cudnn_batchnorm_mode() + + # TODO(anj-s): Set data_format without using Keras. 
+ data_format = flags_obj.data_format + if data_format is None: + data_format = ('channels_first' + if tf.test.is_built_with_cuda() else 'channels_last') + tf.keras.backend.set_image_data_format(data_format) + + strategy = distribution_utils.get_distribution_strategy( + distribution_strategy=flags_obj.distribution_strategy, + num_gpus=flags_obj.num_gpus, + all_reduce_alg=flags_obj.all_reduce_alg, + num_packs=flags_obj.num_packs, + tpu_address=flags_obj.tpu, + tpu_zone=flags_obj.tpu_zone if flags_obj.tpu else None) + mlp_log.mlperf_print('global_batch_size', flags_obj.batch_size) + mlp_log.mlperf_print('train_samples', + imagenet_preprocessing_HW192.NUM_IMAGES['train']) + mlp_log.mlperf_print('eval_samples', + imagenet_preprocessing_HW192.NUM_IMAGES['validation']) + mlp_log.mlperf_print( + 'model_bn_span', + int(flags_obj.batch_size / + (flags_obj.num_replicas if flags_obj.tpu else flags_obj.num_gpus))) + + per_epoch_steps, train_epochs = common.get_num_train_iterations(flags_obj) + eval_steps = common.get_num_eval_steps(flags_obj) + steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps) + + logging.info( + 'Training %d epochs, each epoch has %d steps, ' + 'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps, + train_epochs * per_epoch_steps, eval_steps) + + time_callback = keras_utils.TimeHistory( + flags_obj.batch_size, + flags_obj.log_steps, + logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None) + with distribution_utils.get_strategy_scope(strategy): + runnable = resnet_runnable_HW192.ResnetRunnable(flags_obj, time_callback) + + eval_interval = ( + flags_obj.epochs_between_evals * + per_epoch_steps if not flags_obj.skip_eval else None) + eval_offset = ( + flags_obj.eval_offset_epochs * + per_epoch_steps if not flags_obj.skip_eval else 0) + if eval_offset != 0: + eval_offset -= eval_interval + checkpoint_interval = ( + per_epoch_steps if flags_obj.enable_checkpoint_and_export else None) + summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None + + checkpoint_manager = tf.train.CheckpointManager( + runnable.checkpoint, + directory=flags_obj.model_dir, + max_to_keep=10, + step_counter=runnable.global_step, + checkpoint_interval=checkpoint_interval) + + device_warmup_steps = (flags_obj.device_warmup_steps + if flags_obj.enable_device_warmup else 0) + if flags_obj.enable_device_warmup: + logging.info('Warmup for %d steps.', device_warmup_steps) + + resnet_controller = controller.Controller( + strategy, + runnable.train, + runnable.evaluate, + runnable.warmup, + global_step=runnable.global_step, + steps_per_loop=steps_per_loop, + train_steps=per_epoch_steps * train_epochs, + device_warmup_steps=device_warmup_steps, + checkpoint_manager=checkpoint_manager, + summary_interval=summary_interval, + eval_steps=eval_steps, + eval_interval=eval_interval, + eval_offset=eval_offset) + + if flags_obj.enable_device_warmup: + resnet_controller.warmup() + + mlp_log.mlperf_print('init_stop', None) + + profile_steps = flags_obj.profile_steps + if profile_steps: + profile_steps = [int(i) for i in profile_steps.split(',')] + if profile_steps[0] < 0: + runnable.trace_start(-1) + + time_callback.on_train_begin() + mlp_log.mlperf_print('run_start', None) + mlp_log.mlperf_print( + 'block_start', + None, + metadata={ + 'first_epoch_num': + 1, + 'epoch_count': + (flags_obj.eval_offset_epochs if flags_obj.eval_offset_epochs != 0 + else flags_obj.epochs_between_evals) + }) + resnet_controller.train(evaluate=not flags_obj.skip_eval) + 
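# A pure-Python sketch of the evaluation schedule computed earlier in this
# function (the step counts below are made up). With a nonzero offset the
# controller presumably evaluates at eval_offset + k * eval_interval, i.e.
# first at epoch eval_offset_epochs and every epochs_between_evals after that:
per_epoch_steps_example = 1251
eval_interval_example = 4 * per_epoch_steps_example   # epochs_between_evals = 4
eval_offset_example = 2 * per_epoch_steps_example     # eval_offset_epochs = 2
eval_offset_example -= eval_interval_example          # same adjustment as above
eval_points = [eval_offset_example + k * eval_interval_example for k in (1, 2, 3)]
# -> steps corresponding to epochs 2, 6 and 10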
mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'}) + time_callback.on_train_end() + mlp_log.mlperf_print('run_final', None) + + stats = build_stats(runnable, time_callback) + return stats + + +def define_imagenet_keras_flags(): + common.define_keras_flags() + flags_core.set_defaults() + flags.adopt_module_key_flags(common) + + +def main(_): + model_helpers.apply_clean(flags.FLAGS) + with logger.benchmark_context(flags.FLAGS): + stats = run(flags.FLAGS) + logging.info('Run stats:\n%s', stats) + + +if __name__ == '__main__': + logging.set_verbosity(logging.INFO) + common.define_keras_flags() + app.run(main) \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_model_HW192.py b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_model_HW192.py new file mode 100644 index 0000000000000000000000000000000000000000..04821d5a80aa14af3e0165fe2b8c8fd5c937f55c --- /dev/null +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_model_HW192.py @@ -0,0 +1,453 @@ +#!/usr/bin/env python +# coding=utf-8 + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""ResNet50 model for Keras. + +Adapted from tf.keras.applications.resnet50.ResNet50(). +This is ResNet model version 1.5. 
+ +Related papers/blogs: +- https://arxiv.org/abs/1512.03385 +- https://arxiv.org/pdf/1603.05027v2.pdf +- http://torch.ch/blog/2016/02/04/resnets.html + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import flags +import tensorflow as tf + +import imagenet_preprocessing_HW192 +from keras import backend +from keras import initializers +from keras import layers as tf_python_keras_layers +from keras import models +from keras import regularizers + +BATCH_NORM_DECAY = 0.9 +BATCH_NORM_EPSILON = 1e-5 + +FLAGS = flags.FLAGS + +flags.DEFINE_float( + 'weight_decay', + default=1e-4, + help=('Weight decay coefficiant for l2 regularization.')) + +flags.DEFINE_integer( + 'num_accumulation_steps', + default=8, + help=('number of steps to accumulate with large batch size.')) + +layers = tf_python_keras_layers + + +def change_keras_layer(use_tf_keras_layers=False): + """Change layers to either tf.keras.layers or tf.python.keras.layers. + + Layer version of tf.keras.layers is depends on tensorflow version, but + tf.python.keras.layers checks environment variable TF2_BEHAVIOR. + This function is a temporal function to use tf.keras.layers. + Currently, tf v2 batchnorm layer is slower than tf v1 batchnorm layer. + this function is useful for tracking benchmark result for each version. + This function will be removed when we use tf.keras.layers as default. + + TODO(b/146939027): Remove this function when tf v2 batchnorm reaches training + speed parity with tf v1 batchnorm. + + Args: + use_tf_keras_layers: whether to use tf.keras.layers. + """ + global layers + if use_tf_keras_layers: + layers = tf.keras.layers + else: + layers = tf_python_keras_layers + + +def _gen_l2_regularizer(use_l2_regularizer=True): + return regularizers.l2(FLAGS.weight_decay) if use_l2_regularizer else None + + +def identity_block(input_tensor, + kernel_size, + filters, + stage, + block, + use_l2_regularizer=True): + """The identity block is the block that has no conv layer at shortcut. + + Args: + input_tensor: input tensor + kernel_size: default 3, the kernel size of middle conv layer at main path + filters: list of integers, the filters of 3 conv layer at main path + stage: integer, current stage label, used for generating layer names + block: 'a','b'..., current block label, used for generating layer names + use_l2_regularizer: whether to use L2 regularizer on Conv layer. + + Returns: + Output tensor for the block. 
+ """ + filters1, filters2, filters3 = filters + if backend.image_data_format() == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 + conv_name_base = 'res' + str(stage) + block + '_branch' + bn_name_base = 'bn' + str(stage) + block + '_branch' + + x = layers.Conv2D( + filters1, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2a')( + input_tensor) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2a')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters2, + kernel_size, + padding='same', + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2b')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2b')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters3, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2c')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2c')( + x) + + x = layers.add([x, input_tensor]) + x = layers.Activation('relu')(x) + return x + + +def conv_block(input_tensor, + kernel_size, + filters, + stage, + block, + strides=(2, 2), + use_l2_regularizer=True): + """A block that has a conv layer at shortcut. + + Note that from stage 3, + the second conv layer at main path is with strides=(2, 2) + And the shortcut should have strides=(2, 2) as well + + Args: + input_tensor: input tensor + kernel_size: default 3, the kernel size of middle conv layer at main path + filters: list of integers, the filters of 3 conv layer at main path + stage: integer, current stage label, used for generating layer names + block: 'a','b'..., current block label, used for generating layer names + strides: Strides for the second conv layer in the block. + use_l2_regularizer: whether to use L2 regularizer on Conv layer. + + Returns: + Output tensor for the block. 
+ """ + filters1, filters2, filters3 = filters + if backend.image_data_format() == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 + conv_name_base = 'res' + str(stage) + block + '_branch' + bn_name_base = 'bn' + str(stage) + block + '_branch' + + x = layers.Conv2D( + filters1, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2a')( + input_tensor) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2a')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters2, + kernel_size, + strides=strides, + padding='same', + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2b')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2b')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters3, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2c')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2c')( + x) + + shortcut = layers.Conv2D( + filters3, (1, 1), + strides=strides, + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '1')( + input_tensor) + shortcut = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '1')( + shortcut) + + x = layers.add([x, shortcut]) + x = layers.Activation('relu')(x) + return x + + +def resnet50(num_classes, + batch_size=None, + use_l2_regularizer=True, + rescale_inputs=False): + """Instantiates the ResNet50 architecture. + + Args: + num_classes: `int` number of classes for image classification. + batch_size: Size of the batches for each step. + use_l2_regularizer: whether to use L2 regularizer on Conv/Dense layer. + rescale_inputs: whether to rescale inputs from 0 to 1. + + Returns: + A Keras model instance. + """ + input_shape = (None, None, 3) + img_input = layers.Input(shape=input_shape) + if rescale_inputs: + # Hub image modules expect inputs in the range [0, 1]. This rescales these + # inputs to the range expected by the trained model. 
+ x = layers.Lambda( + lambda x: x * 255.0 - backend.constant( + imagenet_preprocessing_HW192.CHANNEL_MEANS, + shape=[1, 1, 3], + dtype=x.dtype), + name='rescale')( + img_input) + else: + x = img_input + + if backend.image_data_format() == 'channels_first': + x = layers.Lambda( + lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)), + name='transpose')(x) + bn_axis = 1 + else: # channels_last + bn_axis = 3 + + x = layers.ZeroPadding2D(padding=(3, 3), name='conv1_pad')(x) + x = layers.Conv2D( + 64, (7, 7), + strides=(2, 2), + padding='valid', + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name='conv1')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name='bn_conv1')( + x) + x = layers.Activation('relu')(x) + x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) + + x = conv_block( + x, + 3, [64, 64, 256], + stage=2, + block='a', + strides=(1, 1), + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [64, 64, 256], + stage=2, + block='b', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [64, 64, 256], + stage=2, + block='c', + use_l2_regularizer=use_l2_regularizer) + + x = conv_block( + x, + 3, [128, 128, 512], + stage=3, + block='a', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [128, 128, 512], + stage=3, + block='b', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [128, 128, 512], + stage=3, + block='c', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [128, 128, 512], + stage=3, + block='d', + use_l2_regularizer=use_l2_regularizer) + + x = conv_block( + x, + 3, [256, 256, 1024], + stage=4, + block='a', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='b', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='c', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='d', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='e', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='f', + use_l2_regularizer=use_l2_regularizer) + + x = conv_block( + x, + 3, [512, 512, 2048], + stage=5, + block='a', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [512, 512, 2048], + stage=5, + block='b', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [512, 512, 2048], + stage=5, + block='c', + use_l2_regularizer=use_l2_regularizer) + + rm_axes = [1, 2] if backend.image_data_format() == 'channels_last' else [2, 3] + x = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(x) + x = layers.Dense( + num_classes, + kernel_initializer=initializers.RandomNormal(stddev=0.01), + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + bias_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name='fc1000')( + x) + + # A softmax that is followed by the model loss must be done cannot be done + # in float16 due to numeric issues. So we pass dtype=float32. + x = layers.Activation('softmax', dtype='float32')(x) + + # Create model. 
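# A minimal sketch (separate from this function) of the dtype pattern used for
# the classifier head above: under a mixed_float16 policy most layers compute
# in float16, but the closing softmax is forced back to float32 for numerical
# stability. The toy head below is an assumption made only for illustration:
import tensorflow as tf

tf.keras.mixed_precision.set_global_policy('mixed_float16')
feats = tf.keras.Input(shape=(2048,))
logits = tf.keras.layers.Dense(1000)(feats)                        # float16 compute
probs = tf.keras.layers.Activation('softmax', dtype='float32')(logits)
toy_head = tf.keras.Model(feats, probs)   # toy_head.outputs[0].dtype is float32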
+ return models.Model(img_input, x, name='resnet50') \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_runnable_HW192.py b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_runnable_HW192.py new file mode 100644 index 0000000000000000000000000000000000000000..ee7312e6209a0692021a07e5b5dbc0c5313cc153 --- /dev/null +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_runnable_HW192.py @@ -0,0 +1,549 @@ +#!/usr/bin/env python +# coding=utf-8 + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Runs a ResNet model on the ImageNet dataset using custom training loops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import flags +from absl import logging + +import tensorflow as tf + +from tf2_common.training import standard_runnable +from tf2_common.training import utils +from tf2_common.utils.flags import core as flags_core +from tf2_common.utils.mlp_log import mlp_log +import common +import imagenet_preprocessing_HW192 +import resnet_model_HW192 + +import npu_device as npu + +flags.DEFINE_boolean('trace_warmup', default=False, + help='Whether or not to programmatically capture an Xprof' + ' trace in the warmup loop.') + + +class _UnwrapPreventer(object): + """Wrapper that DistributionStrategy will not unwrap. + + Typically, DistributionStrategy will unwrap values when going from a cross- + replica context to a replica context via `call_for_each_replica`. This class + is a wrapper that DistributionStrategy will not unwrap, so it can be used to + prevent it from unwrapping a value. 
+ + TODO(reedwm): Find/implement a better way of preventing values from being + unwrapped by DistributionStrategy + """ + + __slots__ = ['value'] + + def __init__(self, value): + self.value = value + + +class ResnetRunnable(standard_runnable.StandardRunnableWithWarmup): + """Implements the training and evaluation APIs for Resnet model.""" + + def __init__(self, flags_obj, time_callback): + standard_runnable.StandardRunnableWithWarmup.__init__( + self, + flags_obj.use_tf_while_loop, + flags_obj.use_tf_function) + + self.strategy = tf.distribute.get_strategy() + self.flags_obj = flags_obj + self.dtype = flags_core.get_tf_dtype(flags_obj) + self.time_callback = time_callback + + # Input pipeline related + batch_size = flags_obj.batch_size + if batch_size % self.strategy.num_replicas_in_sync != 0: + raise ValueError( + 'Batch size must be divisible by number of replicas : {}'.format( + self.strategy.num_replicas_in_sync)) + + steps_per_epoch, train_epochs = common.get_num_train_iterations(flags_obj) + if train_epochs > 1: + train_epochs = flags_obj.train_epochs + + # As auto rebatching is not supported in + # `experimental_distribute_datasets_from_function()` API, which is + # required when cloning dataset to multiple workers in eager mode, + # we use per-replica batch size. + self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync) + + self.synthetic_input_fn = common.get_synth_input_fn( + height=imagenet_preprocessing_HW192.DEFAULT_IMAGE_SIZE, + width=imagenet_preprocessing_HW192.DEFAULT_IMAGE_SIZE, + num_channels=imagenet_preprocessing_HW192.NUM_CHANNELS, + num_classes=self.flags_obj.num_classes, + dtype=self.dtype, + drop_remainder=True) + + if self.flags_obj.use_synthetic_data: + self.input_fn = self.synthetic_input_fn + else: + self.input_fn = imagenet_preprocessing_HW192.input_fn + + resnet_model_HW192.change_keras_layer(flags_obj.use_tf_keras_layers) + self.model = resnet_model_HW192.resnet50( + num_classes=self.flags_obj.num_classes, + batch_size=flags_obj.batch_size, + use_l2_regularizer=not flags_obj.single_l2_loss_op) + + self.use_lars_optimizer = False + self.num_accumulation_steps = self.flags_obj.num_accumulation_steps + if self.flags_obj.optimizer == 'LARS': + self.use_lars_optimizer = True + self.optimizer, _ = common.get_optimizer( + flags_obj=flags_obj, + steps_per_epoch=steps_per_epoch, + train_steps=steps_per_epoch * train_epochs) + # Make sure iterations variable is created inside scope. + self.global_step = self.optimizer.iterations + + if self.dtype == tf.float16: + loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128) + self.optimizer = ( + tf.keras.mixed_precision.experimental.LossScaleOptimizer( + self.optimizer, loss_scale)) + elif flags_obj.fp16_implementation == 'graph_rewrite': + # `dtype` is still float32 in this case. We built the graph in float32 + # and let the graph rewrite change parts of it float16. 
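# A hedged sketch of the loss-scaling flow this constructor sets up, using the
# newer tf.keras.mixed_precision.LossScaleOptimizer rather than the
# experimental wrapper used in this file: scale the loss before taking
# gradients, then unscale the gradients before applying them.
import tensorflow as tf

opt = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.SGD(0.1), dynamic=False, initial_scale=128)
var = tf.Variable(1.0)
with tf.GradientTape() as tape:
    scaled_loss = opt.get_scaled_loss(tf.square(var))
grads = tape.gradient(scaled_loss, [var])
grads = opt.get_unscaled_gradients(grads)
opt.apply_gradients(zip(grads, [var]))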
+ if not flags_obj.use_tf_function: + raise ValueError('--fp16_implementation=graph_rewrite requires ' + '--use_tf_function to be true') + loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128) + self.optimizer = ( + tf.train.experimental.enable_mixed_precision_graph_rewrite( + self.optimizer, loss_scale)) + + self.one_hot = False + self.label_smoothing = flags_obj.label_smoothing + if self.label_smoothing and self.label_smoothing > 0: + self.one_hot = True + + from metrics_int32 import CategoricalAccuracyInt32 + + if flags_obj.report_accuracy_metrics: + self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32) + if self.one_hot: + #self.train_accuracy = tf.keras.metrics.CategoricalAccuracy( + # 'train_accuracy', dtype=tf.float32) + self.train_accuracy = CategoricalAccuracyInt32( + 'train_accuracy', dtype=tf.float32) + else: + self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( + 'train_accuracy', dtype=tf.float32) + self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32) + else: + self.train_loss = None + self.train_accuracy = None + self.test_loss = None + + if self.one_hot: + #self.test_accuracy = tf.keras.metrics.CategoricalAccuracy( + # 'test_accuracy', dtype=tf.float32) + self.test_accuracy = CategoricalAccuracyInt32( + 'test_accuracy', dtype=tf.float32) + else: + self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( + 'test_accuracy', dtype=tf.float32) + # self.test_corrects = tf.keras.metrics.Sum( + # 'test_corrects', dtype=tf.float32) + self.num_eval_steps = common.get_num_eval_steps(flags_obj) + + self.checkpoint = tf.train.Checkpoint( + model=self.model, optimizer=self.optimizer) + + # Handling epochs. + self.epoch_steps = steps_per_epoch + self.epoch_helper = utils.EpochHelper(steps_per_epoch, self.global_step) + + self.steps_per_loop = flags_obj.steps_per_loop + profile_steps = flags_obj.profile_steps + if profile_steps: + profile_steps = [int(i) for i in profile_steps.split(',')] + self.trace_start_step = profile_steps[0] if profile_steps[0] >= 0 else None + self.trace_end_step = profile_steps[1] + else: + self.trace_start_step = None + self.trace_end_step = None + + self.epochs_between_evals = flags_obj.epochs_between_evals + self.training_vars = self.model.trainable_variables + + npu.distribute.broadcast(self.training_vars, root_rank=0) + + self.accum_grads = [] + self.accum_grads_dtype = tf.float32 + + if self.num_accumulation_steps > 1: + for var in self.training_vars: + self.accum_grads.append(self.optimizer.add_weight( + name=var.name + '_accum', + shape=var.shape, + dtype=self.accum_grads_dtype, + initializer='zeros', + trainable=False, + synchronization=tf.VariableSynchronization.ON_READ, + aggregation=tf.VariableAggregation.SUM)) + + def build_train_dataset(self): + """See base class.""" + return utils.make_distributed_dataset( + self.strategy, + self.input_fn, + is_training=True, + data_dir=self.flags_obj.data_dir, + batch_size=self.batch_size, + datasets_num_private_threads=self.flags_obj + .datasets_num_private_threads, + dtype=self.dtype, + drop_remainder=self.flags_obj.drop_train_remainder, + tf_data_experimental_slack=self.flags_obj.tf_data_experimental_slack, + dataset_cache=self.flags_obj.training_dataset_cache, + prefetch_batchs=self.flags_obj.training_prefetch_batchs) + + def build_eval_dataset(self): + """See base class.""" + return utils.make_distributed_dataset( + self.strategy, + self.input_fn, + is_training=False, + data_dir=self.flags_obj.data_dir, + batch_size=self.batch_size, + 
datasets_num_private_threads=self.flags_obj + .datasets_num_private_threads, + dtype=self.dtype, + drop_remainder=self.flags_obj.drop_eval_remainder, + tf_data_experimental_slack=self.flags_obj.tf_data_experimental_slack, + dataset_cache=self.flags_obj.eval_dataset_cache, + prefetch_batchs=self.flags_obj.eval_prefetch_batchs) + + def build_synthetic_dataset(self): + """See base class.""" + return utils.make_distributed_dataset( + self.strategy, + self.synthetic_input_fn, + is_training=True, + data_dir=self.flags_obj.data_dir, + batch_size=self.batch_size, + datasets_num_private_threads=self.flags_obj + .datasets_num_private_threads, + dtype=self.dtype, + drop_remainder=self.flags_obj.drop_train_remainder, + tf_data_experimental_slack=self.flags_obj.tf_data_experimental_slack, + dataset_cache=self.flags_obj.training_dataset_cache, + prefetch_batchs=self.flags_obj.training_prefetch_batchs) + + def train_loop_begin(self): + """See base class.""" + # Reset all metrics + if self.train_loss: + self.train_loss.reset_states() + if self.train_accuracy: + self.train_accuracy.reset_states() + + self._epoch_begin() + if self.trace_start_step: + global_step = self.global_step.numpy() + next_global_step = global_step + self.steps_per_loop + if (global_step <= self.trace_start_step and + self.trace_start_step < next_global_step): + self.trace_start(global_step) + + self.time_callback.on_batch_begin(self.epoch_helper.batch_index) + + def train_step(self, iterator): + """See base class.""" + + @tf.function(experimental_compile=True) + def local_step(images, labels): + """Local computation of a step.""" + + with tf.GradientTape() as tape: + logits = self.model(images, training=True) + + if self.one_hot: + prediction_loss = tf.keras.losses.categorical_crossentropy( + labels, logits, label_smoothing=self.label_smoothing) + else: + prediction_loss = tf.keras.losses.sparse_categorical_crossentropy( + labels, logits) + loss = tf.reduce_mean(prediction_loss) * 512.0 + # 1.0 / self.flags_obj.batch_size) + + # Save ~3 seconds per epoch on GPU when skipping + # L2 loss computation; can only skip when using LARS + # Details in decription of cl/308018913 + if not self.use_lars_optimizer: + num_replicas = self.strategy.num_replicas_in_sync + + if self.flags_obj.single_l2_loss_op: + l2_loss = self.flags_obj.weight_decay * 2 * tf.add_n([ + tf.nn.l2_loss(v) + for v in self.model.trainable_variables + if 'bn' not in v.name and 'batch_normalization' not in v.name and 'bias' not in v.name + ]) + + #loss += (l2_loss / num_replicas) + loss += (l2_loss * 512.0 / num_replicas) + else: + #loss += (tf.reduce_sum(self.model.losses) / num_replicas) + loss += (tf.reduce_sum(self.model.losses) * 512.0 / num_replicas) + + # Scale the loss + if self.flags_obj.dtype == 'fp16': + loss = self.optimizer.get_scaled_loss(loss) + + #grads = tape.gradient(loss, self.model.trainable_variables) + grads = [grad * (1.0 / 512.0) for grad in tape.gradient(loss, self.model.trainable_variables)] + + # Unscale the grads + if self.flags_obj.dtype == 'fp16': + grads = self.optimizer.get_unscaled_gradients(grads) + + return logits, loss, grads + + def _maybe_apply_grads_and_clear(distribution): + def _apply_grads_and_clear_for_each_replica(): + local_replica_id = tf.get_static_value( + self.strategy.extended._get_local_replica_id( + tf.distribute.get_replica_context().replica_id_in_sync_group)) + replica_accum_grads = [] + for accum_grad, var in zip(self.accum_grads, self.training_vars): + local_accum_grad = self.strategy.experimental_local_results( + 
accum_grad) + replica_accum_grad = local_accum_grad[local_replica_id] + replica_accum_grad = tf.cast(replica_accum_grad, var.dtype) + replica_accum_grads.append(replica_accum_grad) + + replica_accum_grads = npu.distribute.all_reduce(replica_accum_grads, "mean") + + self.optimizer.apply_gradients( + zip(replica_accum_grads, self.training_vars)) + for accum_grad in self.accum_grads: + accum_grad.assign(tf.zeros_like(accum_grad, + dtype=self.accum_grads_dtype), + read_value=False) + def _apply_grads_and_clear(): + distribution.extended.call_for_each_replica( + _apply_grads_and_clear_for_each_replica, + args=()) + return self.optimizer.iterations.assign_add(0, read_value=False) + + def _advance_iteration(): + return self.optimizer.iterations.assign_add(1, read_value=False) + + tf.cond( + tf.equal(self.optimizer.iterations % self.num_accumulation_steps, + self.num_accumulation_steps - 1), + _apply_grads_and_clear, + _advance_iteration) + + def step_fn(inputs): + """Function to run on the device.""" + images, labels = inputs + logits, loss, grads = local_step(images, labels) + + if self.num_accumulation_steps > 1: + for grad, accum_grad in zip(grads, self.accum_grads): + accum_grad.assign_add(tf.cast(grad, self.accum_grads_dtype), + read_value=False) + tf.distribute.get_replica_context().merge_call( + _maybe_apply_grads_and_clear, + args=()) + else: + grads = npu.distribute.all_reduce(grads, "mean") + self.optimizer.apply_gradients(zip(grads, self.training_vars)) + + if self.train_loss: + self.train_loss.update_state(loss) + if self.train_accuracy: + self.train_accuracy.update_state(labels, logits) + + self.strategy.run(step_fn, args=(next(iterator),)) + + def train_loop_end(self): + """See base class.""" + metrics = {} + if self.train_loss: + metrics['train_loss'] = self.train_loss.result() + if self.train_accuracy: + metrics['train_accuracy'] = self.train_accuracy.result() + + self.time_callback.on_batch_end(self.epoch_helper.batch_index - 1) + + if self.trace_end_step: + global_step = self.global_step.numpy() + next_global_step = global_step + self.steps_per_loop + if (global_step <= self.trace_end_step and + self.trace_end_step < next_global_step): + self.trace_end(global_step) + + self._epoch_end() + return metrics + + def eval_begin(self): + """See base class.""" + if self.test_loss: + self.test_loss.reset_states() + if self.test_accuracy: + self.test_accuracy.reset_states() + # self.test_corrects.reset_states() + + epoch_num = int(self.epoch_helper.current_epoch) + mlp_log.mlperf_print('eval_start', None, + metadata={'epoch_num': epoch_num + 1}) + + def eval_step(self, iterator): + """See base class.""" + + def step_fn(inputs): + """Function to run on the device.""" + images, labels = inputs + logits = self.model(images, training=False) + + if self.test_loss: + if self.one_hot: + loss = tf.keras.losses.categorical_crossentropy( + labels, logits, label_smoothing=self.label_smoothing) + else: + loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits) + loss = tf.reduce_sum(loss) * (1.0 / self.flags_obj.batch_size) + self.test_loss.update_state(loss) + + if self.test_accuracy: + self.test_accuracy.update_state(labels, logits) + # tf.print('labels.shape: ', labels.shape, + # ', logits.shape: ', logits.shape, + # ', result: ', self.test_accuracy.result()) + # self.test_corrects.update_state( + # tf.cast( + # tf.reduce_sum( + # tf.cast( + # tf.equal( + # tf.cast(tf.argmax(logits, axis=1), labels.dtype), + # labels), tf.int32)), tf.float32)) + + self.strategy.run(step_fn, 
args=(next(iterator),)) + + def eval_end(self): + """See base class.""" + epoch_num = int(self.epoch_helper.current_epoch) + mlp_log.mlperf_print('eval_stop', None, + metadata={'epoch_num': epoch_num + 1}) + + eval_accuracy = float(self.test_accuracy.result()) + # eval_accuracy = float(self.test_corrects.result() + # ) / imagenet_preprocessing.NUM_IMAGES['validation'] + # eval_accuracy = float(self.test_accuracy.result()) * \ + # self.flags_obj.batch_size * self.num_eval_steps / \ + # imagenet_preprocessing.NUM_IMAGES['validation'] + mlp_log.mlperf_print( + 'eval_accuracy', eval_accuracy, metadata={'epoch_num': epoch_num + 1}) + + first_epoch_num = max(epoch_num - self.epochs_between_evals + 1, 0) + epoch_count = self.epochs_between_evals + if first_epoch_num == 0: + epoch_count = self.flags_obj.eval_offset_epochs + if epoch_count == 0: + epoch_count = self.flags_obj.epochs_between_evals + mlp_log.mlperf_print( + 'block_stop', + None, + metadata={ + 'first_epoch_num': first_epoch_num + 1, + 'epoch_count': epoch_count + }) + + continue_training = True + if eval_accuracy >= self.flags_obj.target_accuracy: + continue_training = False + else: + mlp_log.mlperf_print( + 'block_start', + None, + metadata={ + 'first_epoch_num': epoch_num + 2, + 'epoch_count': self.epochs_between_evals + }) + + results = {} + if self.test_loss: + results['test_loss'] = self.test_loss.result() + if self.test_accuracy: + y = npu.distribute.all_reduce([self.test_accuracy.result()], "mean")[0] + results['test_accuracy'] = y + #results['test_accuracy'] = self.test_accuracy.result() + results['continue_training'] = continue_training + return results + + def warmup_loop_begin(self): + """See base class.""" + if self.flags_obj.trace_warmup: + self.trace_start(-3) + logging.info('Entering the warmup loop.') + + def warmup_loop_end(self): + """See base class.""" + if self.flags_obj.trace_warmup: + self.trace_end(-2) + # Reset the state + self.model.reset_states() + tf.keras.backend.set_value(self.optimizer.iterations, 0) + for accum_grad in self.accum_grads: + accum_grad.assign(tf.zeros_like(accum_grad, + dtype=self.accum_grads_dtype), + read_value=False) + logging.info('Exiting the warmup loop.') + + def _epoch_begin(self): + if self.epoch_helper.epoch_begin(): + self.time_callback.on_epoch_begin(self.epoch_helper.current_epoch) + + def _epoch_end(self): + # mlp_log.mlperf_print('epoch_stop', None) + if self.epoch_helper.epoch_end(): + self.time_callback.on_epoch_end(self.epoch_helper.current_epoch) + + def trace_start(self, global_step): + logging.info('Starting tracing at step %d.', global_step) + tf.profiler.experimental.start(self.flags_obj.model_dir) + + def trace_end(self, global_step): + logging.info('Ending trace at step %d', global_step) + tf.profiler.experimental.stop() \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/ train_performance_1p_HW192.sh b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/ train_performance_1p_HW192.sh new file mode 100644 index 0000000000000000000000000000000000000000..af86411c3303a641c5c5bbd234768482c04620a3 --- /dev/null +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/ train_performance_1p_HW192.sh @@ -0,0 +1,197 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 + +export RANK_SIZE=1 +export JOB_ID=10087 +RANK_ID_START=0 + + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 
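+# Network name (same as this model's directory name). The step counts below
+# derive from the ImageNet training set: train_steps = 1,281,167 images /
+# batch_size, and NPU_LOOP_SIZE / --steps_per_loop are set to that same
+# per-epoch value, so each host-side training loop spans a full epoch.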
+Network="ResNet50_ID0360_for_TensorFlow2.X" +#训练epoch +train_epochs=2 +#训练batch_size +batch_size=256 +#训练step +train_steps=`expr 1281167 / ${batch_size}` +#学习率 +learning_rate=0.495 + +#TF2.X独有,需要模型审视修改 +export NPU_LOOP_SIZE=${train_steps} + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../tensorflow +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + + + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + #绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 + cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` + cpustep=`expr $cpucount / 8` + echo "taskset c steps:" $cpustep + let a=RANK_ID*$cpustep + let b=RANK_ID+1 + let c=b*$cpustep-1 + + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path + nohup taskset -c $a-$c python3 resnet_ctl_imagenet_main_HW192.py \ + --data_dir=${data_path} \ + --num_accumulation_steps=1 \ + --train_steps=${train_steps} \ + --train_epochs=${train_epochs} \ + --model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \ + --distribution_strategy=off \ + --use_tf_while_loop=true \ + --use_tf_function=true \ + --enable_checkpoint_and_export \ + --steps_per_loop=${train_steps} \ + --base_learning_rate=${learning_rate} \ + --momentum=0.901 \ + --epochs_between_evals=1 \ + --eval_offset_epochs=2 \ + --optimizer=SGD \ + --label_smoothing=0.1 \ + --single_l2_loss_op \ + --warmup_epochs=5 \ + --weight_decay=0.000025 \ + --lr_schedule=polynomial \ + --drop_eval_remainder=True \ + --precision_mode=${precision_mode} \ + --over_dump=${over_dump} \ + 
--over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} \ + --data_dump_path=${data_dump_path} \ + --batch_size=${batch_size} \ + --profiling=${profiling} \ + --profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $6}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'| sed 's/,//g' |cut -c 1-5` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep train_loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v BatchTimestamp|awk '{print $10}'|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + +sed -i "/AttributeError/d" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log +sed -i "/MLL/d" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_8p_256bs_SGD_HW192.sh b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_8p_256bs_SGD_HW192.sh new file mode 100644 index 0000000000000000000000000000000000000000..2b156626954a31c3b61a95d944b4d3ffd14bde51 --- /dev/null +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_8p_256bs_SGD_HW192.sh @@ -0,0 +1,207 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 +export RANK_SIZE=8 +export RANK_TABLE_FILE=${cur_path}/../configs/rank_table_8p.json +export JOB_ID=10087 
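+# One training process is launched per RANK_ID in the loop below; each
+# process is pinned to its own CPU range (when --bind_core is set) and
+# writes its log to ${cur_path}/output/<device id>/train_<device id>.log.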
+RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="ResNet50_ID0360_for_TensorFlow2.X" +#训练epoch +train_epochs=12 +#训练batch_size +batch_size=2048 +#训练step +train_steps=`expr 1281167 / ${batch_size}` +#学习率 +learning_rate=3.96 + +#TF2.X独有,需要模型审视修改 +export NPU_LOOP_SIZE=${train_steps} + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_8P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#执行训练,需要模型审视修改 +cd $cur_path/../tensorflow +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + + #绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 + cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` + cpustep=`expr $cpucount / 8` + echo "taskset c steps:" $cpustep + let a=RANK_ID*$cpustep + let b=RANK_ID+1 + let c=b*$cpustep-1 + + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path + if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" + fi + nohup ${bind_core} python3 resnet_ctl_imagenet_main_HW192.py \ + --data_dir=${data_path} \ + --train_steps=${train_steps} \ + --num_accumulation_steps=1 \ + --train_epochs=${train_epochs} \ + --model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \ + --distribution_strategy=off \ + --use_tf_while_loop=true \ + --use_tf_function=true \ + --enable_checkpoint_and_export \ + --steps_per_loop=${train_steps} \ + --base_learning_rate=${learning_rate} \ + --momentum=0.901 \ + --epochs_between_evals=1 \ + 
--eval_offset_epochs=2 \ + --optimizer=SGD \ + --label_smoothing=0.1 \ + --single_l2_loss_op \ + --warmup_epochs=5 \ + --weight_decay=0.000025 \ + --lr_schedule=polynomial \ + --drop_eval_remainder=True \ + --precision_mode=${precision_mode} \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} \ + --data_dump_path=${data_dump_path} \ + --batch_size=${batch_size} \ + --profiling=${profiling} \ + --profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $6}'|tail -n +7|awk '{sum+=$1} END {print sum/NR}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'| sed 's/,//g' |cut -c 1-5` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep train_loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v BatchTimestamp|awk '{print $10}'|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + sed -i "/AttributeError/d" $cur_path/output/${RANK_ID}/train_${RANK_ID}.log + sed -i "/MLL/d" $cur_path/output/${RANK_ID}/train_${RANK_ID}.log +done \ No newline at end of file
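For readers tracing the `num_accumulation_steps > 1` path in `ResnetRunnable.train_step`, the following is a minimal single-device sketch of the accumulate-then-apply pattern it implements. It is illustrative only: the helper names (`make_accumulators`, `accumulate_and_maybe_apply`) do not exist in this repository, plain `tf.Variable`s stand in for the optimizer slot weights created in `__init__`, and averaging over the accumulation window stands in for the cross-replica `npu.distribute.all_reduce(..., "mean")` done in the real merge_call.

import tensorflow as tf

def make_accumulators(variables):
    # One non-trainable float32 accumulator per trainable variable.
    return [tf.Variable(tf.zeros_like(v, dtype=tf.float32), trainable=False)
            for v in variables]

def accumulate_and_maybe_apply(optimizer, variables, grads, accumulators,
                               step, num_accumulation_steps):
    # Sum this step's gradients into the accumulators.
    for acc, grad in zip(accumulators, grads):
        acc.assign_add(tf.cast(grad, tf.float32))
    # On the last micro-step of the window, apply the gradients and reset.
    if step % num_accumulation_steps == num_accumulation_steps - 1:
        applied = [tf.cast(acc / num_accumulation_steps, var.dtype)
                   for acc, var in zip(accumulators, variables)]
        optimizer.apply_gradients(zip(applied, variables))
        for acc in accumulators:
            acc.assign(tf.zeros_like(acc))

# Usage on a toy model: gradients are applied on every second call.
model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
model.build(input_shape=(None, 4))
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
accumulators = make_accumulators(model.trainable_variables)
for step in range(4):
    images = tf.random.normal([8, 4])
    labels = tf.random.uniform([8], maxval=10, dtype=tf.int32)
    with tf.GradientTape() as tape:
        logits = model(images, training=True)
        loss = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(
                labels, logits, from_logits=True))
    grads = tape.gradient(loss, model.trainable_variables)
    accumulate_and_maybe_apply(optimizer, model.trainable_variables, grads,
                               accumulators, step, num_accumulation_steps=2)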