diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/imagenet_preprocessing_HW192.py b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/imagenet_preprocessing_HW192.py new file mode 100644 index 0000000000000000000000000000000000000000..16294a2c94e87b07f9cf03714af5b52e3180cc96 --- /dev/null +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/imagenet_preprocessing_HW192.py @@ -0,0 +1,629 @@ +#!/usr/bin/env python +# coding=utf-8 + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Provides utilities to preprocess images. + +Training images are sampled using the provided bounding boxes, and subsequently +cropped to the sampled bounding box. Images are additionally flipped randomly, +then resized to the target output size (without aspect-ratio preservation). + +Images used during evaluation are resized (with aspect-ratio preservation) and +centrally cropped. + +All images undergo mean color subtraction. + +Note that these steps are colloquially referred to as "ResNet preprocessing," +and they differ from "VGG preprocessing," which does not use bounding boxes +and instead does an aspect-preserving resize followed by random crop during +training. (These both differ from "Inception preprocessing," which introduces +color distortion steps.) + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os +from absl import flags +from absl import logging +import tensorflow as tf + +DEFAULT_IMAGE_SIZE = 192 +EVAL_IMAGE_SIZE = 224 +NUM_CHANNELS = 3 + +NUM_IMAGES = { + 'train': 1281167, + 'validation': 50000, +} + +_NUM_TRAIN_FILES = 1024 +_SHUFFLE_BUFFER = 10000 + +_R_MEAN = 123.68 +_G_MEAN = 116.78 +_B_MEAN = 103.94 +CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN] + +# The lower bound for the smallest side of the image for aspect-preserving +# resizing. For example, if an image is 500 x 1000, it will be resized to +# _RESIZE_MIN x (_RESIZE_MIN * 2). 
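# A pure-Python sketch of the smallest-side arithmetic this constant drives
# (the helper name below is made up for illustration; the graph-mode version
# is _smallest_size_at_least further down in this file):
def _sketch_smallest_side(height, width, resize_min=256):
    scale = resize_min / min(height, width)
    return int(height * scale), int(width * scale)

# e.g. a 500 x 1000 image keeps its aspect ratio: (256, 512)
assert _sketch_smallest_side(500, 1000) == (256, 512)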
+_RESIZE_MIN = 256 + +FLAGS = flags.FLAGS + + +def process_record_dataset(dataset, + is_training, + batch_size, + shuffle_buffer, + dtype=tf.float32, + datasets_num_private_threads=None, + drop_remainder=False, + tf_data_experimental_slack=False, + prefetch_batchs=tf.data.experimental.AUTOTUNE): + """Given a Dataset with raw records, return an iterator over the records. + + Args: + dataset: A Dataset representing raw records + is_training: A boolean denoting whether the input is for training. + batch_size: The number of samples per batch. + shuffle_buffer: The buffer size to use when shuffling records. A larger + value results in better randomness, but smaller values reduce startup + time and use less memory. + dtype: Data type to use for images/features. + datasets_num_private_threads: Number of threads for a private + threadpool created for all datasets computation. + drop_remainder: A boolean indicates whether to drop the remainder of the + batches. If True, the batch dimension will be static. + tf_data_experimental_slack: Whether to enable tf.data's + `experimental_slack` option. + prefetch_batchs: The number of batchs to prefetch. + + Returns: + Dataset of (image, label) pairs ready for iteration. + """ + # Defines a specific size thread pool for tf.data operations. + if datasets_num_private_threads: + options = tf.data.Options() + options.experimental_threading.private_threadpool_size = ( + datasets_num_private_threads) + dataset = dataset.with_options(options) + logging.info( + 'datasets_num_private_threads: %s', datasets_num_private_threads) + + if is_training: + # Shuffles records before repeating to respect epoch boundaries. + dataset = dataset.shuffle(buffer_size=shuffle_buffer) + # Repeats the dataset for the number of epochs to train. + dataset = dataset.repeat() + + one_hot = False + num_classes = FLAGS.num_classes + if FLAGS.label_smoothing and FLAGS.label_smoothing > 0: + one_hot = True + + logging.info('Num classes: %d', num_classes) + logging.info('One hot: %s', one_hot) + if is_training and FLAGS.cache_decoded_image: + parse_record_fn = preprocess_parsed_example + else: + parse_record_fn = parse_and_preprocess_record + + map_fn = functools.partial( + parse_record_fn, + is_training=is_training, + dtype=dtype, + num_classes=num_classes, + one_hot=one_hot) + + # Parses the raw records into images and labels. + #dataset = dataset.map( + # map_fn, + # num_parallel_calls=tf.data.experimental.AUTOTUNE) + dataset = dataset.map( + map_fn, + num_parallel_calls=12) + dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) + + # Operations between the final prefetch and the get_next call to the iterator + # will happen synchronously during run time. We prefetch here again to + # background all of the above processing work and keep it out of the + # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE + # allows DistributionStrategies to adjust how many batches to fetch based + # on how many devices are present. 
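# A minimal, self-contained sketch of the same shuffle -> repeat -> map ->
# batch -> prefetch ordering used above, on dummy in-memory data rather than
# TFRecords (the map function here is just a stand-in):
import tensorflow as tf

ds = tf.data.Dataset.range(100)
ds = ds.shuffle(buffer_size=100)   # shuffle before repeat to respect epoch boundaries
ds = ds.repeat()
ds = ds.map(lambda x: x * 2, num_parallel_calls=tf.data.experimental.AUTOTUNE)
ds = ds.batch(8, drop_remainder=True)
ds = ds.prefetch(tf.data.experimental.AUTOTUNE)  # overlap preprocessing with training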
+ dataset = dataset.prefetch(buffer_size=prefetch_batchs) + + options = tf.data.Options() + options.experimental_slack = tf_data_experimental_slack + dataset = dataset.with_options(options) + + return dataset + + +def get_filenames(is_training, data_dir): + """Return filenames for dataset.""" + if is_training: + return [ + os.path.join(data_dir, 'train-%05d-of-01024' % i) + for i in range(_NUM_TRAIN_FILES)] + else: + return [ + os.path.join(data_dir, 'validation-%05d-of-00128' % i) + for i in range(128)] + + +def parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. + + The output of the build_image_data.py image preprocessing script is a dataset + containing serialized Example protocol buffers. Each Example proto contains + the following fields (values are included as examples): + + image/height: 462 + image/width: 581 + image/colorspace: 'RGB' + image/channels: 3 + image/class/label: 615 + image/class/synset: 'n03623198' + image/class/text: 'knee pad' + image/object/bbox/xmin: 0.1 + image/object/bbox/xmax: 0.9 + image/object/bbox/ymin: 0.2 + image/object/bbox/ymax: 0.6 + image/object/bbox/label: 615 + image/format: 'JPEG' + image/filename: 'ILSVRC2012_val_00041207.JPEG' + image/encoded: + + Args: + example_serialized: scalar Tensor tf.string containing a serialized + Example protocol buffer. + + Returns: + image_buffer: Tensor tf.string containing the contents of a JPEG file. + label: Tensor tf.int32 containing the label. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + """ + # Dense features in Example proto. + feature_map = { + 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, + default_value=''), + 'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64, + default_value=-1), + 'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string, + default_value=''), + } + sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. + feature_map.update( + {k: sparse_float32 for k in [ + 'image/object/bbox/xmin', 'image/object/bbox/ymin', + 'image/object/bbox/xmax', 'image/object/bbox/ymax']}) + + features = tf.io.parse_single_example(serialized=example_serialized, + features=feature_map) + label = tf.cast(features['image/class/label'], dtype=tf.int32) + + xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) + ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) + xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) + ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) + + # Note that we impose an ordering of (y, x) just to make life difficult. + bbox = tf.concat([ymin, xmin, ymax, xmax], 0) + + # Force the variable number of bounding boxes into the shape + # [1, num_boxes, coords]. 
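# A shape-only sketch of the reassembly below, using made-up values for two
# boxes: four [1, num_boxes] rows are concatenated, then rearranged to
# [1, num_boxes, 4] with coordinates ordered (ymin, xmin, ymax, xmax):
import tensorflow as tf

ymin = tf.constant([[0.2, 0.1]])
xmin = tf.constant([[0.1, 0.0]])
ymax = tf.constant([[0.6, 0.9]])
xmax = tf.constant([[0.9, 1.0]])
toy_bbox = tf.concat([ymin, xmin, ymax, xmax], 0)       # [4, num_boxes]
toy_bbox = tf.expand_dims(toy_bbox, 0)                  # [1, 4, num_boxes]
toy_bbox = tf.transpose(toy_bbox, perm=[0, 2, 1])       # [1, num_boxes, 4]
assert toy_bbox.shape == (1, 2, 4)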
+ bbox = tf.expand_dims(bbox, 0) + bbox = tf.transpose(a=bbox, perm=[0, 2, 1]) + + return features['image/encoded'], label, bbox + + +def parse_example_proto_and_decode(example_serialized): + """Parses an example and decodes the image to prepare for caching.""" + image_buffer, label, bbox = parse_example_proto(example_serialized) + image_buffer = tf.reshape(image_buffer, shape=[]) + image_buffer = tf.io.decode_jpeg(image_buffer, 3) + return image_buffer, label, bbox + + +def preprocess_parsed_example( + image_buffer, label, bbox, is_training, dtype, num_classes, one_hot=False): + """Applies preprocessing steps to the input parsed example.""" + image = preprocess_image( + image_buffer=image_buffer, + bbox=bbox, + output_height=DEFAULT_IMAGE_SIZE, + output_width=DEFAULT_IMAGE_SIZE, + num_channels=NUM_CHANNELS, + is_training=is_training) + image = tf.cast(image, dtype) + + # Subtract one so that labels are in [0, 1000), and cast to float32 for + # Keras model. + label = tf.reshape(label, shape=[1]) + label = tf.cast(label, tf.int32) + label -= 1 + + if one_hot: + label = tf.one_hot(label, num_classes) + label = tf.reshape(label, [num_classes]) + else: + label = tf.cast(label, tf.float32) + + return image, label + + +def parse_and_preprocess_record( + raw_record, is_training, dtype, num_classes, one_hot=False): + """Parses and preprocesses a record containing a training example of an image. + + The input record is parsed into a label and image, and the image is passed + through preprocessing steps (cropping, flipping, and so on). + + Args: + raw_record: scalar Tensor tf.string containing a serialized + Example protocol buffer. + is_training: A boolean denoting whether the input is for training. + dtype: data type to use for images/features. + num_classes: Number of classes for one hot encoding. + one_hot: Whether to use one_hot encoding on label. + + Returns: + Tuple with processed image tensor in a channel-last format and + one-hot-encoded label tensor. + """ + image_buffer, label, bbox = parse_example_proto(raw_record) + return preprocess_parsed_example(image_buffer=image_buffer, + label=label, + bbox=bbox, + is_training=is_training, + dtype=dtype, + one_hot=one_hot, + num_classes=num_classes) + + +def input_fn(is_training, + data_dir, + batch_size, + dtype=tf.float32, + datasets_num_private_threads=None, + input_context=None, + drop_remainder=False, + tf_data_experimental_slack=False, + dataset_cache=False, + filenames=None, + prefetch_batchs=tf.data.experimental.AUTOTUNE): + """Input function which provides batches for train or eval. + + Args: + is_training: A boolean denoting whether the input is for training. + data_dir: The directory containing the input data. + batch_size: The number of samples per batch. + dtype: Data type to use for images/features + datasets_num_private_threads: Number of private threads for tf.data. + input_context: A `tf.distribute.InputContext` object passed in by + `tf.distribute.Strategy`. + drop_remainder: A boolean indicates whether to drop the remainder of the + batches. If True, the batch dimension will be static. + tf_data_experimental_slack: Whether to enable tf.data's + `experimental_slack` option. + dataset_cache: Whether to cache the dataset on workers. + Typically used to improve training performance when training data is in + remote storage and can fit into worker memory. + filenames: Optional field for providing the file names of the TFRecords. + prefetch_batchs: The number of batchs to prefetch. 
+ + Returns: + A dataset that can be used for iteration. + """ + if filenames is None: + filenames = get_filenames(is_training, data_dir) + dataset = tf.data.Dataset.from_tensor_slices(filenames) + + import npu_device as npu + dataset, batch_size = npu.distribute.shard_and_rebatch_dataset(dataset, batch_size) + #if input_context: + # logging.info( + # 'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d', + # input_context.input_pipeline_id, input_context.num_input_pipelines) + # dataset = dataset.shard(input_context.num_input_pipelines, + # input_context.input_pipeline_id) + + if is_training: + # Shuffle the input files + dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES) + + # Convert to individual records. + # cycle_length = 10 means that up to 10 files will be read and deserialized in + # parallel. You may want to increase this number if you have a large number of + # CPU cores. + dataset = dataset.interleave( + tf.data.TFRecordDataset, + cycle_length=10, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + if is_training and FLAGS.cache_decoded_image: + dataset = dataset.map( + parse_example_proto_and_decode, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + if dataset_cache: + # Improve training / eval performance when data is in remote storage and + # can fit into worker memory. + dataset = dataset.cache() + + return process_record_dataset( + dataset=dataset, + is_training=is_training, + batch_size=batch_size, + shuffle_buffer=_SHUFFLE_BUFFER, + dtype=dtype, + datasets_num_private_threads=datasets_num_private_threads, + drop_remainder=drop_remainder, + tf_data_experimental_slack=tf_data_experimental_slack, + prefetch_batchs=prefetch_batchs, + ) + + +def _decode_crop_and_flip(image_buffer, bbox, num_channels): + """Crops the given image to a random part of the image, and randomly flips. + + We use the fused decode_and_crop op, which performs better than the two ops + used separately in series, but note that this requires that the image be + passed in as an un-decoded string Tensor. + + Args: + image_buffer: scalar string Tensor representing the raw JPEG image buffer. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + num_channels: Integer depth of the image buffer for decoding. + + Returns: + 3-D tensor with cropped image. + + """ + # A large fraction of image datasets contain a human-annotated bounding box + # delineating the region of the image containing the object of interest. We + # choose to create a new bounding box for the object which is a randomly + # distorted version of the human-annotated bounding box that obeys an + # allowed range of aspect ratios, sizes and overlap with the human-annotated + # bounding box. If no box is supplied, then we assume the bounding box is + # the entire image. + decoded = image_buffer.dtype != tf.string + shape = (tf.shape(image_buffer) if decoded + else tf.image.extract_jpeg_shape(image_buffer)) + sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( + shape, + bounding_boxes=bbox, + min_object_covered=0.1, + aspect_ratio_range=[0.75, 1.33], + area_range=[0.05, 1.0], + max_attempts=100, + use_image_if_no_bounding_boxes=True) + bbox_begin, bbox_size, _ = sample_distorted_bounding_box + + # Reassemble the bounding box in the format the crop op requires. 
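# A standalone sketch of the sampled-crop path used here: sample a distorted
# box, pack it into a crop window, and decode only that window from the JPEG
# bytes ('image.jpg' is a placeholder path, and no annotated boxes are given):
import tensorflow as tf

jpeg_bytes = tf.io.read_file('image.jpg')  # placeholder
begin, size, _ = tf.image.sample_distorted_bounding_box(
    tf.image.extract_jpeg_shape(jpeg_bytes),
    bounding_boxes=tf.zeros([1, 0, 4]),
    min_object_covered=0.1,
    use_image_if_no_bounding_boxes=True)
offset_y, offset_x, _ = tf.unstack(begin)
target_h, target_w, _ = tf.unstack(size)
window = tf.stack([offset_y, offset_x, target_h, target_w])
patch = tf.image.decode_and_crop_jpeg(jpeg_bytes, window, channels=3)
patch = tf.image.random_flip_left_right(patch)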
+ offset_y, offset_x, _ = tf.unstack(bbox_begin) + target_height, target_width, _ = tf.unstack(bbox_size) + crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) + + if decoded: + cropped = tf.image.crop_to_bounding_box( + image_buffer, + offset_height=offset_y, + offset_width=offset_x, + target_height=target_height, + target_width=target_width) + else: + # Use the fused decode and crop op here, which is faster than sequential. + cropped = tf.image.decode_and_crop_jpeg( + image_buffer, crop_window, channels=num_channels) + + # Flip to add a little more random distortion in. + cropped = tf.image.random_flip_left_right(cropped) + return cropped + + +def _central_crop(image, crop_height, crop_width): + """Performs central crops of the given image list. + + Args: + image: a 3-D image tensor + crop_height: the height of the image following the crop. + crop_width: the width of the image following the crop. + + Returns: + 3-D tensor with cropped image. + """ + shape = tf.shape(input=image) + height, width = shape[0], shape[1] + + amount_to_be_cropped_h = (height - crop_height) + crop_top = amount_to_be_cropped_h // 2 + amount_to_be_cropped_w = (width - crop_width) + crop_left = amount_to_be_cropped_w // 2 + return tf.slice( + image, [crop_top, crop_left, 0], [crop_height, crop_width, -1]) + + +def _mean_image_subtraction(image, means, num_channels): + """Subtracts the given means from each image channel. + + For example: + means = [123.68, 116.779, 103.939] + image = _mean_image_subtraction(image, means) + + Note that the rank of `image` must be known. + + Args: + image: a tensor of size [height, width, C]. + means: a C-vector of values to subtract from each channel. + num_channels: number of color channels in the image that will be distorted. + + Returns: + the centered image. + + Raises: + ValueError: If the rank of `image` is unknown, if `image` has a rank other + than three or if the number of channels in `image` doesn't match the + number of values in `means`. + """ + if image.get_shape().ndims != 3: + raise ValueError('Input must be of size [height, width, C>0]') + + if len(means) != num_channels: + raise ValueError('len(means) must match the number of channels') + + # We have a 1-D tensor of means; convert to 3-D. + # Note(b/130245863): we explicitly call `broadcast` instead of simply + # expanding dimensions for better performance. + means = tf.broadcast_to(means, tf.shape(image)) + + return image - means + + +def _smallest_size_at_least(height, width, resize_min): + """Computes new shape with the smallest side equal to `smallest_side`. + + Computes new shape with the smallest side equal to `smallest_side` while + preserving the original aspect ratio. + + Args: + height: an int32 scalar tensor indicating the current height. + width: an int32 scalar tensor indicating the current width. + resize_min: A python integer or scalar `Tensor` indicating the size of + the smallest side after resize. + + Returns: + new_height: an int32 scalar tensor indicating the new height. + new_width: an int32 scalar tensor indicating the new width. + """ + resize_min = tf.cast(resize_min, tf.float32) + + # Convert to floats to make subsequent calculations go smoothly. + height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32) + + smaller_dim = tf.minimum(height, width) + scale_ratio = resize_min / smaller_dim + + # Convert back to ints to make heights and widths that TF ops will accept. 
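# A tiny eager sketch of the two helpers defined above (_central_crop and
# _mean_image_subtraction), with random pixels standing in for a decoded image:
import tensorflow as tf

img = tf.random.uniform([300, 400, 3], maxval=255.0)
top = (tf.shape(img)[0] - 224) // 2
left = (tf.shape(img)[1] - 224) // 2
crop = tf.slice(img, [top, left, 0], [224, 224, -1])
crop -= tf.broadcast_to([123.68, 116.78, 103.94], tf.shape(crop))  # per-channel means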
+ new_height = tf.cast(height * scale_ratio, tf.int32) + new_width = tf.cast(width * scale_ratio, tf.int32) + + return new_height, new_width + + +def _aspect_preserving_resize(image, resize_min): + """Resize images preserving the original aspect ratio. + + Args: + image: A 3-D image `Tensor`. + resize_min: A python integer or scalar `Tensor` indicating the size of + the smallest side after resize. + + Returns: + resized_image: A 3-D tensor containing the resized image. + """ + shape = tf.shape(input=image) + height, width = shape[0], shape[1] + + new_height, new_width = _smallest_size_at_least(height, width, resize_min) + + return _resize_image(image, new_height, new_width) + + +def _resize_image(image, height, width): + """Simple wrapper around tf.resize_images. + + This is primarily to make sure we use the same `ResizeMethod` and other + details each time. + + Args: + image: A 3-D image `Tensor`. + height: The target height for the resized image. + width: The target width for the resized image. + + Returns: + resized_image: A 3-D tensor containing the resized image. The first two + dimensions have the shape [height, width]. + """ + return tf.compat.v1.image.resize( + image, [height, width], method=tf.image.ResizeMethod.BILINEAR, + align_corners=False) + + +def preprocess_image(image_buffer, bbox, output_height, output_width, + num_channels, is_training=False): + """Preprocesses the given image. + + Preprocessing includes decoding, cropping, and resizing for both training + and eval images. Training preprocessing, however, introduces some random + distortion of the image to improve accuracy. + + Args: + image_buffer: scalar string Tensor representing the raw JPEG image buffer. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + output_height: The height of the image after preprocessing. + output_width: The width of the image after preprocessing. + num_channels: Integer depth of the image buffer for decoding. + is_training: `True` if we're preprocessing the image for training and + `False` otherwise. + + Returns: + A preprocessed image. + """ + if is_training: + # For training, we want to randomize some of the distortions. + image = _decode_crop_and_flip(image_buffer, bbox, num_channels) + image = _resize_image(image, output_height, output_width) + else: + # For validation, we want to decode, resize, then just crop the middle. 
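# An end-to-end sketch of the eval path described in the comment above, on
# dummy pixels: aspect-preserving resize so the short side is _RESIZE_MIN,
# then a central crop to EVAL_IMAGE_SIZE (note this file evaluates at 224 even
# though training images are 192). tf.image.resize_with_crop_or_pad stands in
# for the file's own _central_crop helper:
import tensorflow as tf

img = tf.random.uniform([500, 1000, 3], maxval=255.0)
scale = 256.0 / tf.cast(tf.reduce_min(tf.shape(img)[:2]), tf.float32)
new_hw = tf.cast(tf.cast(tf.shape(img)[:2], tf.float32) * scale, tf.int32)
img = tf.image.resize(img, new_hw)                      # -> 256 x 512
img = tf.image.resize_with_crop_or_pad(img, 224, 224)   # central 224 x 224 crop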
+ if image_buffer.dtype == tf.string: + image = tf.image.decode_jpeg(image_buffer, channels=num_channels) + else: + image = image_buffer + output_height = EVAL_IMAGE_SIZE + output_width = EVAL_IMAGE_SIZE + image = _aspect_preserving_resize(image, _RESIZE_MIN) + image = _central_crop(image, output_height, output_width) + + image.set_shape([output_height, output_width, num_channels]) + + return _mean_image_subtraction(image, CHANNEL_MEANS, num_channels) \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_ctl_imagenet_main_HW192.py b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_ctl_imagenet_main_HW192.py new file mode 100644 index 0000000000000000000000000000000000000000..077b2c3ead2e862d1049dd5229eae712b60d0dd3 --- /dev/null +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_ctl_imagenet_main_HW192.py @@ -0,0 +1,337 @@ +#!/usr/bin/env python +# coding=utf-8 + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Runs a ResNet model on the ImageNet dataset using custom training loops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import app +from absl import flags +from absl import logging +import tensorflow as tf + +from tf2_common.modeling import performance +from tf2_common.training import controller +from tf2_common.utils.flags import core as flags_core +from tf2_common.utils.logs import logger +from tf2_common.utils.misc import distribution_utils +from tf2_common.utils.misc import keras_utils +from tf2_common.utils.misc import model_helpers +from tf2_common.utils.mlp_log import mlp_log +import common +import imagenet_preprocessing_HW192 +import resnet_runnable_HW192 +import json +import npu_device + + +flags.DEFINE_boolean(name='use_tf_function', default=True, + help='Wrap the train and test step inside a ' + 'tf.function.') +flags.DEFINE_boolean(name='single_l2_loss_op', default=False, + help='Calculate L2_loss on concatenated weights, ' + 'instead of using Keras per-layer L2 loss.') +flags.DEFINE_boolean(name='cache_decoded_image', default=False, + help='Whether or not to cache decoded images in the ' + 'input pipeline. If this flag and `cache` is enabled, ' + 'then TFExample protos will be parsed and then cached ' + 'which reduces the load on hosts.') +flags.DEFINE_boolean(name='enable_device_warmup', default=False, + help='Whether or not to enable device warmup. This ' + 'includes training on dummy data and enabling graph/XLA ' + 'compilation before run_start.') +flags.DEFINE_integer(name='device_warmup_steps', default=1, + help='The number of steps to apply for device warmup.') +flags.DEFINE_integer(name='num_replicas', default=32, + help='The number of TPU cores to use, ' + 'for log printout only.') + +flags.DEFINE_string(name='precision_mode', default= 'allow_mix_precision', + help='allow_fp32_to_fp16/force_fp16/ ' + 'must_keep_origin_dtype/allow_mix_precision.') +flags.DEFINE_boolean(name='over_dump', default=False, + help='if or not over detection, default is False') +flags.DEFINE_boolean(name='data_dump_flag', default=False, + help='data dump flag, default is False') +flags.DEFINE_string(name='data_dump_step', default="10", + help='data dump step, default is 10') +flags.DEFINE_boolean(name='profiling', default=False, + help='if or not profiling for performance debug, default is False') +flags.DEFINE_string(name='profiling_dump_path', default="/home/data", + help='the path to save profiling data') +flags.DEFINE_string(name='over_dump_path', default="/home/data", + help='the path to save over dump data') +flags.DEFINE_string(name='data_dump_path', default="/home/data", + help='the path to save dump data') +flags.DEFINE_boolean(name='autotune', default=False, + help='whether to enable autotune, default is False') + +def npu_config(): + FLAGS = flags.FLAGS + npu_config = {} + + if FLAGS.data_dump_flag: + npu_device.global_options().dump_config.enable_dump = True + npu_device.global_options().dump_config.dump_path = FLAGS.data_dump_path + npu_device.global_options().dump_config.dump_step = FLAGS.data_dump_step + npu_device.global_options().dump_config.dump_mode = "all" + + if FLAGS.over_dump: + npu_device.global_options().dump_config.enable_dump_debug = True + npu_device.global_options().dump_config.dump_path = FLAGS.over_dump_path + npu_device.global_options().dump_config.dump_debug_mode = "all" + + if FLAGS.profiling: + 
npu_device.global_options().profiling_config.enable_profiling = True + profiling_options = '{"output":"' + FLAGS.profiling_dump_path + '", \ + "training_trace":"on", \ + "task_trace":"on", \ + "aicpu":"on", \ + "fp_point":"while1While_body_while_body_10958_1/while/resnet50/conv1/Conv2Dwhile1While_body_while_body_10958_1/while/resnet50/bn_conv1/FusedBatchNormV3_Reduce", \ + "bp_point":"while1While_body_while_body_10958_1/gradient_tape/while/resnet50/bn5c_branch2c/FusedBatchNormGradV3_Update"}' + npu_device.global_options().profiling_config.profiling_options = profiling_options + npu_device.global_options().precision_mode=FLAGS.precision_mode + npu_device.open().as_default() + + +def build_stats(runnable, time_callback): + """Normalizes and returns dictionary of stats. + + Args: + runnable: The module containing all the training and evaluation metrics. + time_callback: Time tracking callback instance. + + Returns: + Dictionary of normalized results. + """ + stats = {} + + if not runnable.flags_obj.skip_eval: + if runnable.test_loss: + stats['eval_loss'] = runnable.test_loss.result().numpy() + if runnable.test_accuracy: + stats['eval_acc'] = runnable.test_accuracy.result().numpy() + + if runnable.train_loss: + stats['train_loss'] = runnable.train_loss.result().numpy() + if runnable.train_accuracy: + stats['train_acc'] = runnable.train_accuracy.result().numpy() + + if time_callback: + timestamp_log = time_callback.timestamp_log + stats['step_timestamp_log'] = timestamp_log + stats['train_finish_time'] = time_callback.train_finish_time + if time_callback.epoch_runtime_log: + stats['avg_exp_per_second'] = time_callback.average_examples_per_second + + return stats + + +def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop): + """Calculates steps to run on device.""" + if steps_per_loop <= 0: + raise ValueError('steps_per_loop should be positive integer.') + if steps_per_loop == 1: + return steps_per_loop + return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch) + + +def run(flags_obj): + """Run ResNet ImageNet training and eval loop using custom training loops. + + Args: + flags_obj: An object containing parsed flag values. + + Raises: + ValueError: If fp16 is passed as it is not currently supported. + + Returns: + Dictionary of training and eval stats. + """ + mlp_log.mlperf_print('cache_clear', True) + mlp_log.mlperf_print('init_start', None) + mlp_log.mlperf_print('submission_benchmark', 'resnet') + mlp_log.mlperf_print('submission_division', 'closed') + mlp_log.mlperf_print('submission_org', 'google') + mlp_log.mlperf_print( + 'submission_platform', 'tpu-v3-{}'.format(flags_obj.num_replicas) + if flags_obj.tpu else 'gpu-v100-{}'.format(flags_obj.num_gpus)) + mlp_log.mlperf_print('submission_status', 'cloud') + + npu_config() + + common.print_flags(flags_obj) + + keras_utils.set_session_config( + enable_eager=flags_obj.enable_eager, + enable_xla=flags_obj.enable_xla) + performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj)) + + if tf.config.list_physical_devices('GPU'): + if flags_obj.tf_gpu_thread_mode: + datasets_num_private_threads = keras_utils.set_gpu_thread_mode_and_count( + per_gpu_thread_count=flags_obj.per_gpu_thread_count, + gpu_thread_mode=flags_obj.tf_gpu_thread_mode, + num_gpus=flags_obj.num_gpus) + if not flags_obj.datasets_num_private_threads: + flags_obj.datasets_num_private_threads = datasets_num_private_threads + common.set_cudnn_batchnorm_mode() + + # TODO(anj-s): Set data_format without using Keras. 
+ data_format = flags_obj.data_format + if data_format is None: + data_format = ('channels_first' + if tf.test.is_built_with_cuda() else 'channels_last') + tf.keras.backend.set_image_data_format(data_format) + + strategy = distribution_utils.get_distribution_strategy( + distribution_strategy=flags_obj.distribution_strategy, + num_gpus=flags_obj.num_gpus, + all_reduce_alg=flags_obj.all_reduce_alg, + num_packs=flags_obj.num_packs, + tpu_address=flags_obj.tpu, + tpu_zone=flags_obj.tpu_zone if flags_obj.tpu else None) + mlp_log.mlperf_print('global_batch_size', flags_obj.batch_size) + mlp_log.mlperf_print('train_samples', + imagenet_preprocessing_HW192.NUM_IMAGES['train']) + mlp_log.mlperf_print('eval_samples', + imagenet_preprocessing_HW192.NUM_IMAGES['validation']) + mlp_log.mlperf_print( + 'model_bn_span', + int(flags_obj.batch_size / + (flags_obj.num_replicas if flags_obj.tpu else flags_obj.num_gpus))) + + per_epoch_steps, train_epochs = common.get_num_train_iterations(flags_obj) + eval_steps = common.get_num_eval_steps(flags_obj) + steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps) + + logging.info( + 'Training %d epochs, each epoch has %d steps, ' + 'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps, + train_epochs * per_epoch_steps, eval_steps) + + time_callback = keras_utils.TimeHistory( + flags_obj.batch_size, + flags_obj.log_steps, + logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None) + with distribution_utils.get_strategy_scope(strategy): + runnable = resnet_runnable_HW192.ResnetRunnable(flags_obj, time_callback) + + eval_interval = ( + flags_obj.epochs_between_evals * + per_epoch_steps if not flags_obj.skip_eval else None) + eval_offset = ( + flags_obj.eval_offset_epochs * + per_epoch_steps if not flags_obj.skip_eval else 0) + if eval_offset != 0: + eval_offset -= eval_interval + checkpoint_interval = ( + per_epoch_steps if flags_obj.enable_checkpoint_and_export else None) + summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None + + checkpoint_manager = tf.train.CheckpointManager( + runnable.checkpoint, + directory=flags_obj.model_dir, + max_to_keep=10, + step_counter=runnable.global_step, + checkpoint_interval=checkpoint_interval) + + device_warmup_steps = (flags_obj.device_warmup_steps + if flags_obj.enable_device_warmup else 0) + if flags_obj.enable_device_warmup: + logging.info('Warmup for %d steps.', device_warmup_steps) + + resnet_controller = controller.Controller( + strategy, + runnable.train, + runnable.evaluate, + runnable.warmup, + global_step=runnable.global_step, + steps_per_loop=steps_per_loop, + train_steps=per_epoch_steps * train_epochs, + device_warmup_steps=device_warmup_steps, + checkpoint_manager=checkpoint_manager, + summary_interval=summary_interval, + eval_steps=eval_steps, + eval_interval=eval_interval, + eval_offset=eval_offset) + + if flags_obj.enable_device_warmup: + resnet_controller.warmup() + + mlp_log.mlperf_print('init_stop', None) + + profile_steps = flags_obj.profile_steps + if profile_steps: + profile_steps = [int(i) for i in profile_steps.split(',')] + if profile_steps[0] < 0: + runnable.trace_start(-1) + + time_callback.on_train_begin() + mlp_log.mlperf_print('run_start', None) + mlp_log.mlperf_print( + 'block_start', + None, + metadata={ + 'first_epoch_num': + 1, + 'epoch_count': + (flags_obj.eval_offset_epochs if flags_obj.eval_offset_epochs != 0 + else flags_obj.epochs_between_evals) + }) + resnet_controller.train(evaluate=not flags_obj.skip_eval) + 
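# A pure-Python sketch of the evaluation schedule computed earlier in this
# function (the step counts below are made up). With a nonzero offset the
# controller presumably evaluates at eval_offset + k * eval_interval, i.e.
# first at epoch eval_offset_epochs and every epochs_between_evals after that:
per_epoch_steps_example = 1251
eval_interval_example = 4 * per_epoch_steps_example   # epochs_between_evals = 4
eval_offset_example = 2 * per_epoch_steps_example     # eval_offset_epochs = 2
eval_offset_example -= eval_interval_example          # same adjustment as above
eval_points = [eval_offset_example + k * eval_interval_example for k in (1, 2, 3)]
# -> steps corresponding to epochs 2, 6 and 10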
mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'}) + time_callback.on_train_end() + mlp_log.mlperf_print('run_final', None) + + stats = build_stats(runnable, time_callback) + return stats + + +def define_imagenet_keras_flags(): + common.define_keras_flags() + flags_core.set_defaults() + flags.adopt_module_key_flags(common) + + +def main(_): + model_helpers.apply_clean(flags.FLAGS) + with logger.benchmark_context(flags.FLAGS): + stats = run(flags.FLAGS) + logging.info('Run stats:\n%s', stats) + + +if __name__ == '__main__': + logging.set_verbosity(logging.INFO) + common.define_keras_flags() + app.run(main) \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_model_HW192.py b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_model_HW192.py new file mode 100644 index 0000000000000000000000000000000000000000..04821d5a80aa14af3e0165fe2b8c8fd5c937f55c --- /dev/null +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_model_HW192.py @@ -0,0 +1,453 @@ +#!/usr/bin/env python +# coding=utf-8 + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""ResNet50 model for Keras. + +Adapted from tf.keras.applications.resnet50.ResNet50(). +This is ResNet model version 1.5. 
+ +Related papers/blogs: +- https://arxiv.org/abs/1512.03385 +- https://arxiv.org/pdf/1603.05027v2.pdf +- http://torch.ch/blog/2016/02/04/resnets.html + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import flags +import tensorflow as tf + +import imagenet_preprocessing_HW192 +from keras import backend +from keras import initializers +from keras import layers as tf_python_keras_layers +from keras import models +from keras import regularizers + +BATCH_NORM_DECAY = 0.9 +BATCH_NORM_EPSILON = 1e-5 + +FLAGS = flags.FLAGS + +flags.DEFINE_float( + 'weight_decay', + default=1e-4, + help=('Weight decay coefficiant for l2 regularization.')) + +flags.DEFINE_integer( + 'num_accumulation_steps', + default=8, + help=('number of steps to accumulate with large batch size.')) + +layers = tf_python_keras_layers + + +def change_keras_layer(use_tf_keras_layers=False): + """Change layers to either tf.keras.layers or tf.python.keras.layers. + + Layer version of tf.keras.layers is depends on tensorflow version, but + tf.python.keras.layers checks environment variable TF2_BEHAVIOR. + This function is a temporal function to use tf.keras.layers. + Currently, tf v2 batchnorm layer is slower than tf v1 batchnorm layer. + this function is useful for tracking benchmark result for each version. + This function will be removed when we use tf.keras.layers as default. + + TODO(b/146939027): Remove this function when tf v2 batchnorm reaches training + speed parity with tf v1 batchnorm. + + Args: + use_tf_keras_layers: whether to use tf.keras.layers. + """ + global layers + if use_tf_keras_layers: + layers = tf.keras.layers + else: + layers = tf_python_keras_layers + + +def _gen_l2_regularizer(use_l2_regularizer=True): + return regularizers.l2(FLAGS.weight_decay) if use_l2_regularizer else None + + +def identity_block(input_tensor, + kernel_size, + filters, + stage, + block, + use_l2_regularizer=True): + """The identity block is the block that has no conv layer at shortcut. + + Args: + input_tensor: input tensor + kernel_size: default 3, the kernel size of middle conv layer at main path + filters: list of integers, the filters of 3 conv layer at main path + stage: integer, current stage label, used for generating layer names + block: 'a','b'..., current block label, used for generating layer names + use_l2_regularizer: whether to use L2 regularizer on Conv layer. + + Returns: + Output tensor for the block. 
+ """ + filters1, filters2, filters3 = filters + if backend.image_data_format() == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 + conv_name_base = 'res' + str(stage) + block + '_branch' + bn_name_base = 'bn' + str(stage) + block + '_branch' + + x = layers.Conv2D( + filters1, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2a')( + input_tensor) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2a')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters2, + kernel_size, + padding='same', + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2b')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2b')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters3, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2c')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2c')( + x) + + x = layers.add([x, input_tensor]) + x = layers.Activation('relu')(x) + return x + + +def conv_block(input_tensor, + kernel_size, + filters, + stage, + block, + strides=(2, 2), + use_l2_regularizer=True): + """A block that has a conv layer at shortcut. + + Note that from stage 3, + the second conv layer at main path is with strides=(2, 2) + And the shortcut should have strides=(2, 2) as well + + Args: + input_tensor: input tensor + kernel_size: default 3, the kernel size of middle conv layer at main path + filters: list of integers, the filters of 3 conv layer at main path + stage: integer, current stage label, used for generating layer names + block: 'a','b'..., current block label, used for generating layer names + strides: Strides for the second conv layer in the block. + use_l2_regularizer: whether to use L2 regularizer on Conv layer. + + Returns: + Output tensor for the block. 
+ """ + filters1, filters2, filters3 = filters + if backend.image_data_format() == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 + conv_name_base = 'res' + str(stage) + block + '_branch' + bn_name_base = 'bn' + str(stage) + block + '_branch' + + x = layers.Conv2D( + filters1, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2a')( + input_tensor) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2a')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters2, + kernel_size, + strides=strides, + padding='same', + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2b')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2b')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters3, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2c')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2c')( + x) + + shortcut = layers.Conv2D( + filters3, (1, 1), + strides=strides, + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '1')( + input_tensor) + shortcut = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '1')( + shortcut) + + x = layers.add([x, shortcut]) + x = layers.Activation('relu')(x) + return x + + +def resnet50(num_classes, + batch_size=None, + use_l2_regularizer=True, + rescale_inputs=False): + """Instantiates the ResNet50 architecture. + + Args: + num_classes: `int` number of classes for image classification. + batch_size: Size of the batches for each step. + use_l2_regularizer: whether to use L2 regularizer on Conv/Dense layer. + rescale_inputs: whether to rescale inputs from 0 to 1. + + Returns: + A Keras model instance. + """ + input_shape = (None, None, 3) + img_input = layers.Input(shape=input_shape) + if rescale_inputs: + # Hub image modules expect inputs in the range [0, 1]. This rescales these + # inputs to the range expected by the trained model. 
+ x = layers.Lambda( + lambda x: x * 255.0 - backend.constant( + imagenet_preprocessing_HW192.CHANNEL_MEANS, + shape=[1, 1, 3], + dtype=x.dtype), + name='rescale')( + img_input) + else: + x = img_input + + if backend.image_data_format() == 'channels_first': + x = layers.Lambda( + lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)), + name='transpose')(x) + bn_axis = 1 + else: # channels_last + bn_axis = 3 + + x = layers.ZeroPadding2D(padding=(3, 3), name='conv1_pad')(x) + x = layers.Conv2D( + 64, (7, 7), + strides=(2, 2), + padding='valid', + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name='conv1')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name='bn_conv1')( + x) + x = layers.Activation('relu')(x) + x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) + + x = conv_block( + x, + 3, [64, 64, 256], + stage=2, + block='a', + strides=(1, 1), + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [64, 64, 256], + stage=2, + block='b', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [64, 64, 256], + stage=2, + block='c', + use_l2_regularizer=use_l2_regularizer) + + x = conv_block( + x, + 3, [128, 128, 512], + stage=3, + block='a', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [128, 128, 512], + stage=3, + block='b', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [128, 128, 512], + stage=3, + block='c', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [128, 128, 512], + stage=3, + block='d', + use_l2_regularizer=use_l2_regularizer) + + x = conv_block( + x, + 3, [256, 256, 1024], + stage=4, + block='a', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='b', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='c', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='d', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='e', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='f', + use_l2_regularizer=use_l2_regularizer) + + x = conv_block( + x, + 3, [512, 512, 2048], + stage=5, + block='a', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [512, 512, 2048], + stage=5, + block='b', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [512, 512, 2048], + stage=5, + block='c', + use_l2_regularizer=use_l2_regularizer) + + rm_axes = [1, 2] if backend.image_data_format() == 'channels_last' else [2, 3] + x = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(x) + x = layers.Dense( + num_classes, + kernel_initializer=initializers.RandomNormal(stddev=0.01), + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + bias_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name='fc1000')( + x) + + # A softmax that is followed by the model loss must be done cannot be done + # in float16 due to numeric issues. So we pass dtype=float32. + x = layers.Activation('softmax', dtype='float32')(x) + + # Create model. 
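# A minimal sketch (separate from this function) of the dtype pattern used for
# the classifier head above: under a mixed_float16 policy most layers compute
# in float16, but the closing softmax is forced back to float32 for numerical
# stability. The toy head below is an assumption made only for illustration:
import tensorflow as tf

tf.keras.mixed_precision.set_global_policy('mixed_float16')
feats = tf.keras.Input(shape=(2048,))
logits = tf.keras.layers.Dense(1000)(feats)                        # float16 compute
probs = tf.keras.layers.Activation('softmax', dtype='float32')(logits)
toy_head = tf.keras.Model(feats, probs)   # toy_head.outputs[0].dtype is float32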
+ return models.Model(img_input, x, name='resnet50') \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_runnable_HW192.py b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_runnable_HW192.py new file mode 100644 index 0000000000000000000000000000000000000000..ee7312e6209a0692021a07e5b5dbc0c5313cc153 --- /dev/null +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/tensorflow/resnet_runnable_HW192.py @@ -0,0 +1,549 @@ +#!/usr/bin/env python +# coding=utf-8 + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Runs a ResNet model on the ImageNet dataset using custom training loops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import flags +from absl import logging + +import tensorflow as tf + +from tf2_common.training import standard_runnable +from tf2_common.training import utils +from tf2_common.utils.flags import core as flags_core +from tf2_common.utils.mlp_log import mlp_log +import common +import imagenet_preprocessing_HW192 +import resnet_model_HW192 + +import npu_device as npu + +flags.DEFINE_boolean('trace_warmup', default=False, + help='Whether or not to programmatically capture an Xprof' + ' trace in the warmup loop.') + + +class _UnwrapPreventer(object): + """Wrapper that DistributionStrategy will not unwrap. + + Typically, DistributionStrategy will unwrap values when going from a cross- + replica context to a replica context via `call_for_each_replica`. This class + is a wrapper that DistributionStrategy will not unwrap, so it can be used to + prevent it from unwrapping a value. 
+ + TODO(reedwm): Find/implement a better way of preventing values from being + unwrapped by DistributionStrategy + """ + + __slots__ = ['value'] + + def __init__(self, value): + self.value = value + + +class ResnetRunnable(standard_runnable.StandardRunnableWithWarmup): + """Implements the training and evaluation APIs for Resnet model.""" + + def __init__(self, flags_obj, time_callback): + standard_runnable.StandardRunnableWithWarmup.__init__( + self, + flags_obj.use_tf_while_loop, + flags_obj.use_tf_function) + + self.strategy = tf.distribute.get_strategy() + self.flags_obj = flags_obj + self.dtype = flags_core.get_tf_dtype(flags_obj) + self.time_callback = time_callback + + # Input pipeline related + batch_size = flags_obj.batch_size + if batch_size % self.strategy.num_replicas_in_sync != 0: + raise ValueError( + 'Batch size must be divisible by number of replicas : {}'.format( + self.strategy.num_replicas_in_sync)) + + steps_per_epoch, train_epochs = common.get_num_train_iterations(flags_obj) + if train_epochs > 1: + train_epochs = flags_obj.train_epochs + + # As auto rebatching is not supported in + # `experimental_distribute_datasets_from_function()` API, which is + # required when cloning dataset to multiple workers in eager mode, + # we use per-replica batch size. + self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync) + + self.synthetic_input_fn = common.get_synth_input_fn( + height=imagenet_preprocessing_HW192.DEFAULT_IMAGE_SIZE, + width=imagenet_preprocessing_HW192.DEFAULT_IMAGE_SIZE, + num_channels=imagenet_preprocessing_HW192.NUM_CHANNELS, + num_classes=self.flags_obj.num_classes, + dtype=self.dtype, + drop_remainder=True) + + if self.flags_obj.use_synthetic_data: + self.input_fn = self.synthetic_input_fn + else: + self.input_fn = imagenet_preprocessing_HW192.input_fn + + resnet_model_HW192.change_keras_layer(flags_obj.use_tf_keras_layers) + self.model = resnet_model_HW192.resnet50( + num_classes=self.flags_obj.num_classes, + batch_size=flags_obj.batch_size, + use_l2_regularizer=not flags_obj.single_l2_loss_op) + + self.use_lars_optimizer = False + self.num_accumulation_steps = self.flags_obj.num_accumulation_steps + if self.flags_obj.optimizer == 'LARS': + self.use_lars_optimizer = True + self.optimizer, _ = common.get_optimizer( + flags_obj=flags_obj, + steps_per_epoch=steps_per_epoch, + train_steps=steps_per_epoch * train_epochs) + # Make sure iterations variable is created inside scope. + self.global_step = self.optimizer.iterations + + if self.dtype == tf.float16: + loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128) + self.optimizer = ( + tf.keras.mixed_precision.experimental.LossScaleOptimizer( + self.optimizer, loss_scale)) + elif flags_obj.fp16_implementation == 'graph_rewrite': + # `dtype` is still float32 in this case. We built the graph in float32 + # and let the graph rewrite change parts of it float16. 
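# A hedged sketch of the loss-scaling flow this constructor sets up, using the
# newer tf.keras.mixed_precision.LossScaleOptimizer rather than the
# experimental wrapper used in this file: scale the loss before taking
# gradients, then unscale the gradients before applying them.
import tensorflow as tf

opt = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.SGD(0.1), dynamic=False, initial_scale=128)
var = tf.Variable(1.0)
with tf.GradientTape() as tape:
    scaled_loss = opt.get_scaled_loss(tf.square(var))
grads = tape.gradient(scaled_loss, [var])
grads = opt.get_unscaled_gradients(grads)
opt.apply_gradients(zip(grads, [var]))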
+ if not flags_obj.use_tf_function: + raise ValueError('--fp16_implementation=graph_rewrite requires ' + '--use_tf_function to be true') + loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128) + self.optimizer = ( + tf.train.experimental.enable_mixed_precision_graph_rewrite( + self.optimizer, loss_scale)) + + self.one_hot = False + self.label_smoothing = flags_obj.label_smoothing + if self.label_smoothing and self.label_smoothing > 0: + self.one_hot = True + + from metrics_int32 import CategoricalAccuracyInt32 + + if flags_obj.report_accuracy_metrics: + self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32) + if self.one_hot: + #self.train_accuracy = tf.keras.metrics.CategoricalAccuracy( + # 'train_accuracy', dtype=tf.float32) + self.train_accuracy = CategoricalAccuracyInt32( + 'train_accuracy', dtype=tf.float32) + else: + self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( + 'train_accuracy', dtype=tf.float32) + self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32) + else: + self.train_loss = None + self.train_accuracy = None + self.test_loss = None + + if self.one_hot: + #self.test_accuracy = tf.keras.metrics.CategoricalAccuracy( + # 'test_accuracy', dtype=tf.float32) + self.test_accuracy = CategoricalAccuracyInt32( + 'test_accuracy', dtype=tf.float32) + else: + self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( + 'test_accuracy', dtype=tf.float32) + # self.test_corrects = tf.keras.metrics.Sum( + # 'test_corrects', dtype=tf.float32) + self.num_eval_steps = common.get_num_eval_steps(flags_obj) + + self.checkpoint = tf.train.Checkpoint( + model=self.model, optimizer=self.optimizer) + + # Handling epochs. + self.epoch_steps = steps_per_epoch + self.epoch_helper = utils.EpochHelper(steps_per_epoch, self.global_step) + + self.steps_per_loop = flags_obj.steps_per_loop + profile_steps = flags_obj.profile_steps + if profile_steps: + profile_steps = [int(i) for i in profile_steps.split(',')] + self.trace_start_step = profile_steps[0] if profile_steps[0] >= 0 else None + self.trace_end_step = profile_steps[1] + else: + self.trace_start_step = None + self.trace_end_step = None + + self.epochs_between_evals = flags_obj.epochs_between_evals + self.training_vars = self.model.trainable_variables + + npu.distribute.broadcast(self.training_vars, root_rank=0) + + self.accum_grads = [] + self.accum_grads_dtype = tf.float32 + + if self.num_accumulation_steps > 1: + for var in self.training_vars: + self.accum_grads.append(self.optimizer.add_weight( + name=var.name + '_accum', + shape=var.shape, + dtype=self.accum_grads_dtype, + initializer='zeros', + trainable=False, + synchronization=tf.VariableSynchronization.ON_READ, + aggregation=tf.VariableAggregation.SUM)) + + def build_train_dataset(self): + """See base class.""" + return utils.make_distributed_dataset( + self.strategy, + self.input_fn, + is_training=True, + data_dir=self.flags_obj.data_dir, + batch_size=self.batch_size, + datasets_num_private_threads=self.flags_obj + .datasets_num_private_threads, + dtype=self.dtype, + drop_remainder=self.flags_obj.drop_train_remainder, + tf_data_experimental_slack=self.flags_obj.tf_data_experimental_slack, + dataset_cache=self.flags_obj.training_dataset_cache, + prefetch_batchs=self.flags_obj.training_prefetch_batchs) + + def build_eval_dataset(self): + """See base class.""" + return utils.make_distributed_dataset( + self.strategy, + self.input_fn, + is_training=False, + data_dir=self.flags_obj.data_dir, + batch_size=self.batch_size, + 
datasets_num_private_threads=self.flags_obj + .datasets_num_private_threads, + dtype=self.dtype, + drop_remainder=self.flags_obj.drop_eval_remainder, + tf_data_experimental_slack=self.flags_obj.tf_data_experimental_slack, + dataset_cache=self.flags_obj.eval_dataset_cache, + prefetch_batchs=self.flags_obj.eval_prefetch_batchs) + + def build_synthetic_dataset(self): + """See base class.""" + return utils.make_distributed_dataset( + self.strategy, + self.synthetic_input_fn, + is_training=True, + data_dir=self.flags_obj.data_dir, + batch_size=self.batch_size, + datasets_num_private_threads=self.flags_obj + .datasets_num_private_threads, + dtype=self.dtype, + drop_remainder=self.flags_obj.drop_train_remainder, + tf_data_experimental_slack=self.flags_obj.tf_data_experimental_slack, + dataset_cache=self.flags_obj.training_dataset_cache, + prefetch_batchs=self.flags_obj.training_prefetch_batchs) + + def train_loop_begin(self): + """See base class.""" + # Reset all metrics + if self.train_loss: + self.train_loss.reset_states() + if self.train_accuracy: + self.train_accuracy.reset_states() + + self._epoch_begin() + if self.trace_start_step: + global_step = self.global_step.numpy() + next_global_step = global_step + self.steps_per_loop + if (global_step <= self.trace_start_step and + self.trace_start_step < next_global_step): + self.trace_start(global_step) + + self.time_callback.on_batch_begin(self.epoch_helper.batch_index) + + def train_step(self, iterator): + """See base class.""" + + @tf.function(experimental_compile=True) + def local_step(images, labels): + """Local computation of a step.""" + + with tf.GradientTape() as tape: + logits = self.model(images, training=True) + + if self.one_hot: + prediction_loss = tf.keras.losses.categorical_crossentropy( + labels, logits, label_smoothing=self.label_smoothing) + else: + prediction_loss = tf.keras.losses.sparse_categorical_crossentropy( + labels, logits) + loss = tf.reduce_mean(prediction_loss) * 512.0 + # 1.0 / self.flags_obj.batch_size) + + # Save ~3 seconds per epoch on GPU when skipping + # L2 loss computation; can only skip when using LARS + # Details in decription of cl/308018913 + if not self.use_lars_optimizer: + num_replicas = self.strategy.num_replicas_in_sync + + if self.flags_obj.single_l2_loss_op: + l2_loss = self.flags_obj.weight_decay * 2 * tf.add_n([ + tf.nn.l2_loss(v) + for v in self.model.trainable_variables + if 'bn' not in v.name and 'batch_normalization' not in v.name and 'bias' not in v.name + ]) + + #loss += (l2_loss / num_replicas) + loss += (l2_loss * 512.0 / num_replicas) + else: + #loss += (tf.reduce_sum(self.model.losses) / num_replicas) + loss += (tf.reduce_sum(self.model.losses) * 512.0 / num_replicas) + + # Scale the loss + if self.flags_obj.dtype == 'fp16': + loss = self.optimizer.get_scaled_loss(loss) + + #grads = tape.gradient(loss, self.model.trainable_variables) + grads = [grad * (1.0 / 512.0) for grad in tape.gradient(loss, self.model.trainable_variables)] + + # Unscale the grads + if self.flags_obj.dtype == 'fp16': + grads = self.optimizer.get_unscaled_gradients(grads) + + return logits, loss, grads + + def _maybe_apply_grads_and_clear(distribution): + def _apply_grads_and_clear_for_each_replica(): + local_replica_id = tf.get_static_value( + self.strategy.extended._get_local_replica_id( + tf.distribute.get_replica_context().replica_id_in_sync_group)) + replica_accum_grads = [] + for accum_grad, var in zip(self.accum_grads, self.training_vars): + local_accum_grad = self.strategy.experimental_local_results( + 
accum_grad) + replica_accum_grad = local_accum_grad[local_replica_id] + replica_accum_grad = tf.cast(replica_accum_grad, var.dtype) + replica_accum_grads.append(replica_accum_grad) + + replica_accum_grads = npu.distribute.all_reduce(replica_accum_grads, "mean") + + self.optimizer.apply_gradients( + zip(replica_accum_grads, self.training_vars)) + for accum_grad in self.accum_grads: + accum_grad.assign(tf.zeros_like(accum_grad, + dtype=self.accum_grads_dtype), + read_value=False) + def _apply_grads_and_clear(): + distribution.extended.call_for_each_replica( + _apply_grads_and_clear_for_each_replica, + args=()) + return self.optimizer.iterations.assign_add(0, read_value=False) + + def _advance_iteration(): + return self.optimizer.iterations.assign_add(1, read_value=False) + + tf.cond( + tf.equal(self.optimizer.iterations % self.num_accumulation_steps, + self.num_accumulation_steps - 1), + _apply_grads_and_clear, + _advance_iteration) + + def step_fn(inputs): + """Function to run on the device.""" + images, labels = inputs + logits, loss, grads = local_step(images, labels) + + if self.num_accumulation_steps > 1: + for grad, accum_grad in zip(grads, self.accum_grads): + accum_grad.assign_add(tf.cast(grad, self.accum_grads_dtype), + read_value=False) + tf.distribute.get_replica_context().merge_call( + _maybe_apply_grads_and_clear, + args=()) + else: + grads = npu.distribute.all_reduce(grads, "mean") + self.optimizer.apply_gradients(zip(grads, self.training_vars)) + + if self.train_loss: + self.train_loss.update_state(loss) + if self.train_accuracy: + self.train_accuracy.update_state(labels, logits) + + self.strategy.run(step_fn, args=(next(iterator),)) + + def train_loop_end(self): + """See base class.""" + metrics = {} + if self.train_loss: + metrics['train_loss'] = self.train_loss.result() + if self.train_accuracy: + metrics['train_accuracy'] = self.train_accuracy.result() + + self.time_callback.on_batch_end(self.epoch_helper.batch_index - 1) + + if self.trace_end_step: + global_step = self.global_step.numpy() + next_global_step = global_step + self.steps_per_loop + if (global_step <= self.trace_end_step and + self.trace_end_step < next_global_step): + self.trace_end(global_step) + + self._epoch_end() + return metrics + + def eval_begin(self): + """See base class.""" + if self.test_loss: + self.test_loss.reset_states() + if self.test_accuracy: + self.test_accuracy.reset_states() + # self.test_corrects.reset_states() + + epoch_num = int(self.epoch_helper.current_epoch) + mlp_log.mlperf_print('eval_start', None, + metadata={'epoch_num': epoch_num + 1}) + + def eval_step(self, iterator): + """See base class.""" + + def step_fn(inputs): + """Function to run on the device.""" + images, labels = inputs + logits = self.model(images, training=False) + + if self.test_loss: + if self.one_hot: + loss = tf.keras.losses.categorical_crossentropy( + labels, logits, label_smoothing=self.label_smoothing) + else: + loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits) + loss = tf.reduce_sum(loss) * (1.0 / self.flags_obj.batch_size) + self.test_loss.update_state(loss) + + if self.test_accuracy: + self.test_accuracy.update_state(labels, logits) + # tf.print('labels.shape: ', labels.shape, + # ', logits.shape: ', logits.shape, + # ', result: ', self.test_accuracy.result()) + # self.test_corrects.update_state( + # tf.cast( + # tf.reduce_sum( + # tf.cast( + # tf.equal( + # tf.cast(tf.argmax(logits, axis=1), labels.dtype), + # labels), tf.int32)), tf.float32)) + + self.strategy.run(step_fn, 
args=(next(iterator),)) + + def eval_end(self): + """See base class.""" + epoch_num = int(self.epoch_helper.current_epoch) + mlp_log.mlperf_print('eval_stop', None, + metadata={'epoch_num': epoch_num + 1}) + + eval_accuracy = float(self.test_accuracy.result()) + # eval_accuracy = float(self.test_corrects.result() + # ) / imagenet_preprocessing.NUM_IMAGES['validation'] + # eval_accuracy = float(self.test_accuracy.result()) * \ + # self.flags_obj.batch_size * self.num_eval_steps / \ + # imagenet_preprocessing.NUM_IMAGES['validation'] + mlp_log.mlperf_print( + 'eval_accuracy', eval_accuracy, metadata={'epoch_num': epoch_num + 1}) + + first_epoch_num = max(epoch_num - self.epochs_between_evals + 1, 0) + epoch_count = self.epochs_between_evals + if first_epoch_num == 0: + epoch_count = self.flags_obj.eval_offset_epochs + if epoch_count == 0: + epoch_count = self.flags_obj.epochs_between_evals + mlp_log.mlperf_print( + 'block_stop', + None, + metadata={ + 'first_epoch_num': first_epoch_num + 1, + 'epoch_count': epoch_count + }) + + continue_training = True + if eval_accuracy >= self.flags_obj.target_accuracy: + continue_training = False + else: + mlp_log.mlperf_print( + 'block_start', + None, + metadata={ + 'first_epoch_num': epoch_num + 2, + 'epoch_count': self.epochs_between_evals + }) + + results = {} + if self.test_loss: + results['test_loss'] = self.test_loss.result() + if self.test_accuracy: + y = npu.distribute.all_reduce([self.test_accuracy.result()], "mean")[0] + results['test_accuracy'] = y + #results['test_accuracy'] = self.test_accuracy.result() + results['continue_training'] = continue_training + return results + + def warmup_loop_begin(self): + """See base class.""" + if self.flags_obj.trace_warmup: + self.trace_start(-3) + logging.info('Entering the warmup loop.') + + def warmup_loop_end(self): + """See base class.""" + if self.flags_obj.trace_warmup: + self.trace_end(-2) + # Reset the state + self.model.reset_states() + tf.keras.backend.set_value(self.optimizer.iterations, 0) + for accum_grad in self.accum_grads: + accum_grad.assign(tf.zeros_like(accum_grad, + dtype=self.accum_grads_dtype), + read_value=False) + logging.info('Exiting the warmup loop.') + + def _epoch_begin(self): + if self.epoch_helper.epoch_begin(): + self.time_callback.on_epoch_begin(self.epoch_helper.current_epoch) + + def _epoch_end(self): + # mlp_log.mlperf_print('epoch_stop', None) + if self.epoch_helper.epoch_end(): + self.time_callback.on_epoch_end(self.epoch_helper.current_epoch) + + def trace_start(self, global_step): + logging.info('Starting tracing at step %d.', global_step) + tf.profiler.experimental.start(self.flags_obj.model_dir) + + def trace_end(self, global_step): + logging.info('Ending trace at step %d', global_step) + tf.profiler.experimental.stop() \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/ train_performance_1p_HW192.sh b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/ train_performance_1p_HW192.sh new file mode 100644 index 0000000000000000000000000000000000000000..af86411c3303a641c5c5bbd234768482c04620a3 --- /dev/null +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/ train_performance_1p_HW192.sh @@ -0,0 +1,197 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 + +export RANK_SIZE=1 +export JOB_ID=10087 +RANK_ID_START=0 + + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 
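+# Network name (same as this model's directory name). The step counts below
+# derive from the ImageNet training set: train_steps = 1,281,167 images /
+# batch_size, and NPU_LOOP_SIZE / --steps_per_loop are set to that same
+# per-epoch value, so each host-side training loop spans a full epoch.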
+Network="ResNet50_ID0360_for_TensorFlow2.X" +#训练epoch +train_epochs=2 +#训练batch_size +batch_size=256 +#训练step +train_steps=`expr 1281167 / ${batch_size}` +#学习率 +learning_rate=0.495 + +#TF2.X独有,需要模型审视修改 +export NPU_LOOP_SIZE=${train_steps} + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../tensorflow +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + + + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + #绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 + cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` + cpustep=`expr $cpucount / 8` + echo "taskset c steps:" $cpustep + let a=RANK_ID*$cpustep + let b=RANK_ID+1 + let c=b*$cpustep-1 + + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path + nohup taskset -c $a-$c python3 resnet_ctl_imagenet_main_HW192.py \ + --data_dir=${data_path} \ + --num_accumulation_steps=1 \ + --train_steps=${train_steps} \ + --train_epochs=${train_epochs} \ + --model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \ + --distribution_strategy=off \ + --use_tf_while_loop=true \ + --use_tf_function=true \ + --enable_checkpoint_and_export \ + --steps_per_loop=${train_steps} \ + --base_learning_rate=${learning_rate} \ + --momentum=0.901 \ + --epochs_between_evals=1 \ + --eval_offset_epochs=2 \ + --optimizer=SGD \ + --label_smoothing=0.1 \ + --single_l2_loss_op \ + --warmup_epochs=5 \ + --weight_decay=0.000025 \ + --lr_schedule=polynomial \ + --drop_eval_remainder=True \ + --precision_mode=${precision_mode} \ + --over_dump=${over_dump} \ + 
--over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} \ + --data_dump_path=${data_dump_path} \ + --batch_size=${batch_size} \ + --profiling=${profiling} \ + --profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $6}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'| sed 's/,//g' |cut -c 1-5` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep train_loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v BatchTimestamp|awk '{print $10}'|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + +sed -i "/AttributeError/d" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log +sed -i "/MLL/d" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_8p_256bs_SGD_HW192.sh b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_8p_256bs_SGD_HW192.sh new file mode 100644 index 0000000000000000000000000000000000000000..2b156626954a31c3b61a95d944b4d3ffd14bde51 --- /dev/null +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_8p_256bs_SGD_HW192.sh @@ -0,0 +1,207 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 +export RANK_SIZE=8 +export RANK_TABLE_FILE=${cur_path}/../configs/rank_table_8p.json +export JOB_ID=10087 
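+# One training process is launched per RANK_ID in the loop below; each
+# process is pinned to its own CPU range (when --bind_core is set) and
+# writes its log to ${cur_path}/output/<device id>/train_<device id>.log.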
+RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="ResNet50_ID0360_for_TensorFlow2.X" +#训练epoch +train_epochs=12 +#训练batch_size +batch_size=2048 +#训练step +train_steps=`expr 1281167 / ${batch_size}` +#学习率 +learning_rate=3.96 + +#TF2.X独有,需要模型审视修改 +export NPU_LOOP_SIZE=${train_steps} + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_8P.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is 0 + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#执行训练,需要模型审视修改 +cd $cur_path/../tensorflow +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + + #绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 + cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` + cpustep=`expr $cpucount / 8` + echo "taskset c steps:" $cpustep + let a=RANK_ID*$cpustep + let b=RANK_ID+1 + let c=b*$cpustep-1 + + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path + if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" + fi + nohup ${bind_core} python3 resnet_ctl_imagenet_main_HW192.py \ + --data_dir=${data_path} \ + --train_steps=${train_steps} \ + --num_accumulation_steps=1 \ + --train_epochs=${train_epochs} \ + --model_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt \ + --distribution_strategy=off \ + --use_tf_while_loop=true \ + --use_tf_function=true \ + --enable_checkpoint_and_export \ + --steps_per_loop=${train_steps} \ + --base_learning_rate=${learning_rate} \ + --momentum=0.901 \ + --epochs_between_evals=1 \ + 
--eval_offset_epochs=2 \ + --optimizer=SGD \ + --label_smoothing=0.1 \ + --single_l2_loss_op \ + --warmup_epochs=5 \ + --weight_decay=0.000025 \ + --lr_schedule=polynomial \ + --drop_eval_remainder=True \ + --precision_mode=${precision_mode} \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} \ + --data_dump_path=${data_dump_path} \ + --batch_size=${batch_size} \ + --profiling=${profiling} \ + --profiling_dump_path=${profiling_dump_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $6}'|tail -n +7|awk '{sum+=$1} END {print sum/NR}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep eval_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v mlp_log|awk 'END {print $5}'| sed 's/,//g' |cut -c 1-5` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep train_loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v BatchTimestamp|awk '{print $10}'|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + sed -i "/AttributeError/d" $cur_path/output/${RANK_ID}/train_${RANK_ID}.log + sed -i "/MLL/d" $cur_path/output/${RANK_ID}/train_${RANK_ID}.log +done \ No newline at end of file
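For readers tracing the `num_accumulation_steps > 1` path in `ResnetRunnable.train_step`, the following is a minimal single-device sketch of the accumulate-then-apply pattern it implements. It is illustrative only: the helper names (`make_accumulators`, `accumulate_and_maybe_apply`) do not exist in this repository, plain `tf.Variable`s stand in for the optimizer slot weights created in `__init__`, and averaging over the accumulation window stands in for the cross-replica `npu.distribute.all_reduce(..., "mean")` done in the real merge_call.

import tensorflow as tf

def make_accumulators(variables):
    # One non-trainable float32 accumulator per trainable variable.
    return [tf.Variable(tf.zeros_like(v, dtype=tf.float32), trainable=False)
            for v in variables]

def accumulate_and_maybe_apply(optimizer, variables, grads, accumulators,
                               step, num_accumulation_steps):
    # Sum this step's gradients into the accumulators.
    for acc, grad in zip(accumulators, grads):
        acc.assign_add(tf.cast(grad, tf.float32))
    # On the last micro-step of the window, apply the gradients and reset.
    if step % num_accumulation_steps == num_accumulation_steps - 1:
        applied = [tf.cast(acc / num_accumulation_steps, var.dtype)
                   for acc, var in zip(accumulators, variables)]
        optimizer.apply_gradients(zip(applied, variables))
        for acc in accumulators:
            acc.assign(tf.zeros_like(acc))

# Usage on a toy model: gradients are applied on every second call.
model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
model.build(input_shape=(None, 4))
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
accumulators = make_accumulators(model.trainable_variables)
for step in range(4):
    images = tf.random.normal([8, 4])
    labels = tf.random.uniform([8], maxval=10, dtype=tf.int32)
    with tf.GradientTape() as tape:
        logits = model(images, training=True)
        loss = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(
                labels, logits, from_logits=True))
    grads = tape.gradient(loss, model.trainable_variables)
    accumulate_and_maybe_apply(optimizer, model.trainable_variables, grads,
                               accumulators, step, num_accumulation_steps=2)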